author    Eliot Horowitz <eliot@10gen.com>  2011-12-24 15:33:26 -0500
committer Eliot Horowitz <eliot@10gen.com>  2011-12-24 15:33:45 -0500
commit    ae1ecd9c786911f9f1f0242f0f7d702b3e5dfeba (patch)
tree      92f8e1649e6f080b251ff5f1763679a72eb59b34 /src/mongo/db
parent    dfa4cd7e2cf109b072440155fabc08a93c8045a0 (diff)
download  mongo-ae1ecd9c786911f9f1f0242f0f7d702b3e5dfeba.tar.gz
bulk move of code to src/ SERVER-4551
Diffstat (limited to 'src/mongo/db')
-rw-r--r--  src/mongo/db/background.h | 56
-rw-r--r--  src/mongo/db/btree.cpp | 1980
-rw-r--r--  src/mongo/db/btree.h | 1174
-rw-r--r--  src/mongo/db/btreebuilder.cpp | 184
-rw-r--r--  src/mongo/db/btreebuilder.h | 53
-rw-r--r--  src/mongo/db/btreecursor.cpp | 457
-rw-r--r--  src/mongo/db/cap.cpp | 457
-rw-r--r--  src/mongo/db/client.cpp | 697
-rw-r--r--  src/mongo/db/client.h | 286
-rw-r--r--  src/mongo/db/client_common.h | 47
-rw-r--r--  src/mongo/db/clientcursor.cpp | 747
-rw-r--r--  src/mongo/db/clientcursor.h | 430
-rw-r--r--  src/mongo/db/cloner.cpp | 763
-rw-r--r--  src/mongo/db/cloner.h | 39
-rw-r--r--  src/mongo/db/cmdline.cpp | 519
-rw-r--r--  src/mongo/db/cmdline.h | 203
-rw-r--r--  src/mongo/db/collection.h | 15
-rwxr-xr-x  src/mongo/db/commands.cpp | 209
-rw-r--r--  src/mongo/db/commands.h | 164
-rwxr-xr-x  src/mongo/db/commands/aggregate.js | 184
-rw-r--r--  src/mongo/db/commands/cloud.cpp | 90
-rw-r--r--  src/mongo/db/commands/distinct.cpp | 157
-rwxr-xr-x  src/mongo/db/commands/document_source_cursor.cpp | 100
-rw-r--r--  src/mongo/db/commands/find_and_modify.cpp | 153
-rw-r--r--  src/mongo/db/commands/group.cpp | 224
-rw-r--r--  src/mongo/db/commands/isself.cpp | 246
-rw-r--r--  src/mongo/db/commands/mr.cpp | 1317
-rw-r--r--  src/mongo/db/commands/mr.h | 319
-rwxr-xr-x  src/mongo/db/commands/pipeline.cpp | 405
-rwxr-xr-x  src/mongo/db/commands/pipeline.h | 183
-rwxr-xr-x  src/mongo/db/commands/pipeline_command.cpp | 187
-rw-r--r--  src/mongo/db/common.cpp | 73
-rw-r--r--  src/mongo/db/compact.cpp | 376
-rw-r--r--  src/mongo/db/compact.h | 50
-rw-r--r--  src/mongo/db/concurrency.h | 21
-rw-r--r--  src/mongo/db/curop-inl.h | 1
-rw-r--r--  src/mongo/db/curop.cpp | 173
-rw-r--r--  src/mongo/db/curop.h | 313
-rw-r--r--  src/mongo/db/cursor.cpp | 166
-rw-r--r--  src/mongo/db/cursor.h | 246
-rwxr-xr-x  src/mongo/db/d_concurrency.cpp | 231
-rw-r--r--  src/mongo/db/d_concurrency.h | 67
-rw-r--r--  src/mongo/db/d_globals.cpp | 20
-rw-r--r--  src/mongo/db/d_globals.h | 27
-rw-r--r--  src/mongo/db/database.cpp | 423
-rw-r--r--  src/mongo/db/database.h | 145
-rw-r--r--  src/mongo/db/databaseholder.h | 126
-rw-r--r--  src/mongo/db/db.cpp | 1309
-rw-r--r--  src/mongo/db/db.h | 120
-rwxr-xr-x  src/mongo/db/db.rc | 12
-rwxr-xr-x  src/mongo/db/db.vcxproj | 934
-rwxr-xr-x  src/mongo/db/db.vcxproj.filters | 432
-rwxr-xr-x  src/mongo/db/db_10.sln | 168
-rw-r--r--  src/mongo/db/dbcommands.cpp | 1955
-rw-r--r--  src/mongo/db/dbcommands_admin.cpp | 550
-rw-r--r--  src/mongo/db/dbcommands_generic.cpp | 432
-rw-r--r--  src/mongo/db/dbeval.cpp | 136
-rw-r--r--  src/mongo/db/dbhelpers.cpp | 353
-rw-r--r--  src/mongo/db/dbhelpers.h | 159
-rw-r--r--  src/mongo/db/dbmessage.cpp | 108
-rw-r--r--  src/mongo/db/dbmessage.h | 282
-rw-r--r--  src/mongo/db/dbwebserver.cpp | 539
-rw-r--r--  src/mongo/db/dbwebserver.h | 85
-rw-r--r--  src/mongo/db/diskloc.h | 160
-rw-r--r--  src/mongo/db/driverHelpers.cpp | 62
-rw-r--r--  src/mongo/db/dur.cpp | 840
-rw-r--r--  src/mongo/db/dur.h | 209
-rw-r--r--  src/mongo/db/dur_commitjob.cpp | 240
-rw-r--r--  src/mongo/db/dur_commitjob.h | 220
-rw-r--r--  src/mongo/db/dur_journal.cpp | 748
-rw-r--r--  src/mongo/db/dur_journal.h | 68
-rw-r--r--  src/mongo/db/dur_journalformat.h | 174
-rw-r--r--  src/mongo/db/dur_journalimpl.h | 103
-rw-r--r--  src/mongo/db/dur_preplogbuffer.cpp | 177
-rw-r--r--  src/mongo/db/dur_recover.cpp | 544
-rw-r--r--  src/mongo/db/dur_recover.h | 50
-rw-r--r--  src/mongo/db/dur_stats.h | 49
-rw-r--r--  src/mongo/db/dur_writetodatafiles.cpp | 94
-rw-r--r--  src/mongo/db/durop.cpp | 161
-rw-r--r--  src/mongo/db/durop.h | 109
-rw-r--r--  src/mongo/db/extsort.cpp | 245
-rw-r--r--  src/mongo/db/extsort.h | 150
-rw-r--r--  src/mongo/db/filever.h | 30
-rw-r--r--  src/mongo/db/flushtest.cpp | 150
-rw-r--r--  src/mongo/db/geo/2d.cpp | 3289
-rw-r--r--  src/mongo/db/geo/core.h | 550
-rw-r--r--  src/mongo/db/geo/haystack.cpp | 318
-rw-r--r--  src/mongo/db/globals.h | 54
-rw-r--r--  src/mongo/db/helpers/dblogger.h | 31
-rw-r--r--  src/mongo/db/index.cpp | 446
-rw-r--r--  src/mongo/db/index.h | 237
-rw-r--r--  src/mongo/db/indexkey.cpp | 462
-rw-r--r--  src/mongo/db/indexkey.h | 198
-rw-r--r--  src/mongo/db/instance.cpp | 1148
-rw-r--r--  src/mongo/db/instance.h | 174
-rw-r--r--  src/mongo/db/introspect.cpp | 88
-rw-r--r--  src/mongo/db/introspect.h | 34
-rw-r--r--  src/mongo/db/javatest.cpp | 24
-rw-r--r--  src/mongo/db/jsobj.cpp | 1268
-rw-r--r--  src/mongo/db/jsobj.h | 47
-rw-r--r--  src/mongo/db/jsobjmanipulator.h | 94
-rw-r--r--  src/mongo/db/json.cpp | 651
-rw-r--r--  src/mongo/db/json.h | 41
-rw-r--r--  src/mongo/db/key.cpp | 678
-rw-r--r--  src/mongo/db/key.h | 115
-rw-r--r--  src/mongo/db/lasterror.cpp | 142
-rw-r--r--  src/mongo/db/lasterror.h | 146
-rwxr-xr-x  src/mongo/db/matcher.cpp | 1128
-rw-r--r--  src/mongo/db/matcher.h | 276
-rw-r--r--  src/mongo/db/matcher_covered.cpp | 101
-rw-r--r--  src/mongo/db/minilex.h | 164
-rw-r--r--  src/mongo/db/module.cpp | 68
-rw-r--r--  src/mongo/db/module.h | 70
-rw-r--r--  src/mongo/db/modules/mms.cpp | 170
-rwxr-xr-x  src/mongo/db/mongo.ico | bin 0 -> 51262 bytes
-rw-r--r--  src/mongo/db/mongommf.cpp | 339
-rw-r--r--  src/mongo/db/mongommf.h | 145
-rw-r--r--  src/mongo/db/mongomutex.h | 388
-rw-r--r--  src/mongo/db/namespace-inl.h | 132
-rw-r--r--  src/mongo/db/namespace.cpp | 800
-rw-r--r--  src/mongo/db/namespace.h | 629
-rw-r--r--  src/mongo/db/namespacestring.h | 147
-rw-r--r--  src/mongo/db/nonce.cpp | 95
-rw-r--r--  src/mongo/db/nonce.h | 36
-rw-r--r--  src/mongo/db/oplog.cpp | 872
-rw-r--r--  src/mongo/db/oplog.h | 149
-rw-r--r--  src/mongo/db/oplogreader.h | 121
-rw-r--r--  src/mongo/db/ops/count.cpp | 103
-rw-r--r--  src/mongo/db/ops/count.h | 30
-rw-r--r--  src/mongo/db/ops/delete.cpp | 158
-rw-r--r--  src/mongo/db/ops/delete.h | 33
-rw-r--r--  src/mongo/db/ops/query.cpp | 870
-rw-r--r--  src/mongo/db/ops/query.h | 248
-rw-r--r--  src/mongo/db/ops/update.cpp | 1308
-rw-r--r--  src/mongo/db/ops/update.h | 700
-rw-r--r--  src/mongo/db/pagefault.cpp | 55
-rw-r--r--  src/mongo/db/pagefault.h | 46
-rw-r--r--  src/mongo/db/pcre.txt | 15
-rw-r--r--  src/mongo/db/pdfile.cpp | 2425
-rw-r--r--  src/mongo/db/pdfile.h | 546
-rwxr-xr-x  src/mongo/db/pipeline/accumulator.cpp | 92
-rwxr-xr-x  src/mongo/db/pipeline/accumulator.h | 259
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_add_to_set.cpp | 79
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_avg.cpp | 123
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_first.cpp | 49
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_last.cpp | 48
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_min_max.cpp | 67
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_push.cpp | 73
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_single_value.cpp | 32
-rwxr-xr-x  src/mongo/db/pipeline/accumulator_sum.cpp | 74
-rwxr-xr-x  src/mongo/db/pipeline/builder.cpp | 117
-rwxr-xr-x  src/mongo/db/pipeline/builder.h | 95
-rwxr-xr-x  src/mongo/db/pipeline/doc_mem_monitor.cpp | 68
-rwxr-xr-x  src/mongo/db/pipeline/doc_mem_monitor.h | 94
-rwxr-xr-x  src/mongo/db/pipeline/document.cpp | 219
-rwxr-xr-x  src/mongo/db/pipeline/document.h | 246
-rwxr-xr-x  src/mongo/db/pipeline/document_source.cpp | 52
-rwxr-xr-x  src/mongo/db/pipeline/document_source.h | 985
-rwxr-xr-x  src/mongo/db/pipeline/document_source_bson_array.cpp | 83
-rwxr-xr-x  src/mongo/db/pipeline/document_source_command_futures.cpp | 132
-rwxr-xr-x  src/mongo/db/pipeline/document_source_filter.cpp | 98
-rwxr-xr-x  src/mongo/db/pipeline/document_source_filter_base.cpp | 85
-rwxr-xr-x  src/mongo/db/pipeline/document_source_group.cpp | 391
-rw-r--r--  src/mongo/db/pipeline/document_source_limit.cpp | 83
-rwxr-xr-x  src/mongo/db/pipeline/document_source_match.cpp | 80
-rwxr-xr-x  src/mongo/db/pipeline/document_source_out.cpp | 56
-rwxr-xr-x  src/mongo/db/pipeline/document_source_project.cpp | 201
-rw-r--r--  src/mongo/db/pipeline/document_source_skip.cpp | 99
-rwxr-xr-x  src/mongo/db/pipeline/document_source_sort.cpp | 216
-rwxr-xr-x  src/mongo/db/pipeline/document_source_unwind.cpp | 234
-rwxr-xr-x  src/mongo/db/pipeline/expression.cpp | 2815
-rwxr-xr-x  src/mongo/db/pipeline/expression.h | 1223
-rwxr-xr-x  src/mongo/db/pipeline/expression_context.cpp | 35
-rwxr-xr-x  src/mongo/db/pipeline/expression_context.h | 67
-rwxr-xr-x  src/mongo/db/pipeline/field_path.cpp | 87
-rwxr-xr-x  src/mongo/db/pipeline/field_path.h | 82
-rwxr-xr-x  src/mongo/db/pipeline/value.cpp | 1034
-rwxr-xr-x  src/mongo/db/pipeline/value.h | 468
-rw-r--r--  src/mongo/db/projection.cpp | 301
-rw-r--r--  src/mongo/db/projection.h | 129
-rw-r--r--  src/mongo/db/queryoptimizer.cpp | 1337
-rw-r--r--  src/mongo/db/queryoptimizer.h | 599
-rw-r--r--  src/mongo/db/queryoptimizercursor.cpp | 530
-rw-r--r--  src/mongo/db/queryoptimizercursor.h | 150
-rw-r--r--  src/mongo/db/querypattern.cpp | 99
-rw-r--r--  src/mongo/db/querypattern.h | 78
-rw-r--r--  src/mongo/db/queryutil-inl.h | 153
-rw-r--r--  src/mongo/db/queryutil.cpp | 1551
-rw-r--r--  src/mongo/db/queryutil.h | 443
-rw-r--r--  src/mongo/db/record.cpp | 267
-rw-r--r--  src/mongo/db/repl.cpp | 1516
-rw-r--r--  src/mongo/db/repl.h | 199
-rw-r--r--  src/mongo/db/repl/connections.h | 128
-rw-r--r--  src/mongo/db/repl/consensus.cpp | 449
-rw-r--r--  src/mongo/db/repl/health.cpp | 449
-rw-r--r--  src/mongo/db/repl/health.h | 50
-rw-r--r--  src/mongo/db/repl/heartbeat.cpp | 382
-rw-r--r--  src/mongo/db/repl/manager.cpp | 274
-rw-r--r--  src/mongo/db/repl/multicmd.h | 75
-rw-r--r--  src/mongo/db/repl/replset_commands.cpp | 404
-rw-r--r--  src/mongo/db/repl/rs.cpp | 778
-rw-r--r--  src/mongo/db/repl/rs.h | 667
-rw-r--r--  src/mongo/db/repl/rs_config.cpp | 662
-rw-r--r--  src/mongo/db/repl/rs_config.h | 251
-rw-r--r--  src/mongo/db/repl/rs_exception.h | 17
-rw-r--r--  src/mongo/db/repl/rs_initialsync.cpp | 271
-rw-r--r--  src/mongo/db/repl/rs_initiate.cpp | 269
-rw-r--r--  src/mongo/db/repl/rs_member.h | 131
-rw-r--r--  src/mongo/db/repl/rs_optime.h | 58
-rw-r--r--  src/mongo/db/repl/rs_rollback.cpp | 667
-rw-r--r--  src/mongo/db/repl/rs_sync.cpp | 701
-rw-r--r--  src/mongo/db/repl/test.html | 11
-rw-r--r--  src/mongo/db/repl/testing.js | 42
-rw-r--r--  src/mongo/db/repl_block.cpp | 256
-rw-r--r--  src/mongo/db/repl_block.h | 39
-rw-r--r--  src/mongo/db/replutil.h | 102
-rw-r--r--  src/mongo/db/resource.h | 16
-rw-r--r--  src/mongo/db/restapi.cpp | 294
-rw-r--r--  src/mongo/db/restapi.h | 34
-rw-r--r--  src/mongo/db/scanandorder.cpp | 105
-rw-r--r--  src/mongo/db/scanandorder.h | 111
-rw-r--r--  src/mongo/db/security.cpp | 106
-rwxr-xr-x  src/mongo/db/security.h | 113
-rw-r--r--  src/mongo/db/security_commands.cpp | 150
-rw-r--r--  src/mongo/db/security_common.cpp | 148
-rw-r--r--  src/mongo/db/security_common.h | 85
-rw-r--r--  src/mongo/db/stats/counters.cpp | 207
-rw-r--r--  src/mongo/db/stats/counters.h | 159
-rw-r--r--  src/mongo/db/stats/fine_clock.h | 67
-rw-r--r--  src/mongo/db/stats/service_stats.cpp | 68
-rw-r--r--  src/mongo/db/stats/service_stats.h | 66
-rw-r--r--  src/mongo/db/stats/snapshots.cpp | 227
-rw-r--r--  src/mongo/db/stats/snapshots.h | 114
-rw-r--r--  src/mongo/db/stats/top.cpp | 183
-rw-r--r--  src/mongo/db/stats/top.h | 247
-rw-r--r--  src/mongo/db/taskqueue.h | 106
-rw-r--r--  src/mongo/db/tests.cpp | 68
237 files changed, 76762 insertions, 0 deletions
diff --git a/src/mongo/db/background.h b/src/mongo/db/background.h
new file mode 100644
index 00000000000..ea424c97107
--- /dev/null
+++ b/src/mongo/db/background.h
@@ -0,0 +1,56 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* background.h
+
+ Concurrency coordination for administrative operations.
+*/
+
+#pragma once
+
+namespace mongo {
+
+    /* These are administrative operations / jobs that run in the
+       background for a namespace. Only one is permitted per namespace
+       at a time, and while one is in progress other major
+       NamespaceDetails manipulations (such as dropping the ns or db)
+       are not allowed, even in the foreground; they must uassert
+       instead.
+
+       This is not assumed to be a high-RPS path, so the implementation
+       here makes no special effort to be fast.
+ */
+ class BackgroundOperation : public boost::noncopyable {
+ public:
+ static bool inProgForDb(const char *db);
+ static bool inProgForNs(const char *ns);
+ static void assertNoBgOpInProgForDb(const char *db);
+ static void assertNoBgOpInProgForNs(const char *ns);
+ static void dump(stringstream&);
+
+ /* check for in progress before instantiating */
+ BackgroundOperation(const char *ns);
+
+ virtual ~BackgroundOperation();
+
+ private:
+ NamespaceString _ns;
+ static map<string, unsigned> dbsInProg;
+ static set<string> nsInProg;
+ };
+
+} // namespace mongo
+
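A minimal usage sketch for the guard declared above (an editorial illustration, not part of this commit; the calling function is assumed):

    // Typical caller: refuse to start if the namespace is already busy,
    // then hold the guard for the duration of the admin job.
    void runBackgroundAdminJob(const char *ns) {
        BackgroundOperation::assertNoBgOpInProgForNs(ns); // uasserts if busy
        BackgroundOperation bgOp(ns);  // marks ns (and its db) in progress
        // ... long-running admin work; drops of this ns/db now uassert ...
    }                                  // destructor clears the registration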
diff --git a/src/mongo/db/btree.cpp b/src/mongo/db/btree.cpp
new file mode 100644
index 00000000000..5c55fad33c3
--- /dev/null
+++ b/src/mongo/db/btree.cpp
@@ -0,0 +1,1980 @@
+// btree.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop-inl.h"
+#include "stats/counters.h"
+#include "dur_commitjob.h"
+#include "btreebuilder.h"
+#include "../util/unittest.h"
+#include "../server.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( Record::HeaderSize == 16 );
+ BOOST_STATIC_ASSERT( Record::HeaderSize + BtreeData_V1::BucketSize == 8192 );
+
+ NOINLINE_DECL void checkFailed(unsigned line) {
+ static time_t last;
+ if( time(0) - last >= 10 ) {
+            msgasserted(15898, str::stream() << "error in index, possibly corruption; consider repairing " << line);
+ }
+ }
+
+ /** data check. like assert, but gives a reasonable error message to the user. */
+#define check(expr) if(!(expr) ) { checkFailed(__LINE__); }
+
+#define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this );
+
+ template< class Loc >
+ __KeyNode<Loc> & __KeyNode<Loc>::writing() const {
+ return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) );
+ }
+
+ // BucketBasics::lowWaterMark()
+ //
+ // We define this value as the maximum number of bytes such that, if we have
+ // fewer than this many bytes, we must be able to either merge with or receive
+ // keys from any neighboring node. If our utilization goes below this value we
+ // know we can bring up the utilization with a simple operation. Ignoring the
+ // 90/10 split policy which is sometimes employed and our 'unused' nodes, this
+ // is a lower bound on bucket utilization for non root buckets.
+ //
+ // Note that the exact value here depends on the implementation of
+ // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
+ // follows: We know we cannot merge with the neighbor, so the total data size
+ // for us, the neighbor, and the separator must be at least
+ // BtreeBucket<V>::bodySize() + 1. We must be able to accept one key of any
+ // allowed size, so our size plus storage for that additional key must be
+ // <= BtreeBucket<V>::bodySize() / 2. This way, with the extra key we'll have a
+ // new bucket data size < half the total data size and by the implementation
+ // of rebalancedSeparatorPos() the key must be added.
+
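+    // A sketch of the bound described above (editorial illustration, not
+    // part of this commit). One closed form consistent with these
+    // conditions is:
+    //
+    //     static int lowWaterMark() {
+    //         return bodySize() / 2 - KeyMax - (int)sizeof(_KeyNode) + 1;
+    //     }
+    //
+    // i.e. half the body, minus room for one maximum-size key and its
+    // _KeyNode slot; below this threshold a merge with, or a borrow from,
+    // a neighbor is always possible.
+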
+ static const int split_debug = 0;
+ static const int insert_debug = 0;
+
+ /**
+     * this error is ok/benign when doing a background index build -- the logic in pdfile checks explicitly
+ * for the 10287 error code.
+ */
+ static void alreadyInIndex() {
+ // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord()
+ throw MsgAssertionException(10287, "btree: key+recloc already in index");
+ }
+
+ /* BucketBasics --------------------------------------------------- */
+
+ template< class V >
+ void BucketBasics<V>::assertWritable() {
+ if( cmdLine.dur )
+ dur::assertAlreadyDeclared(this, V::BucketSize);
+ }
+
+ template< class V >
+ string BtreeBucket<V>::bucketSummary() const {
+ stringstream ss;
+ ss << " Bucket info:" << endl;
+ ss << " n: " << this->n << endl;
+ ss << " parent: " << this->parent.toString() << endl;
+ ss << " nextChild: " << this->nextChild.toString() << endl;
+ ss << " flags:" << this->flags << endl;
+ ss << " emptySize: " << this->emptySize << " topSize: " << this->topSize << endl;
+ return ss.str();
+ }
+
+ template< class V >
+ int BucketBasics<V>::Size() const {
+ return V::BucketSize;
+ }
+
+ template< class V >
+ void BucketBasics<V>::_shape(int level, stringstream& ss) const {
+ for ( int i = 0; i < level; i++ ) ss << ' ';
+ ss << "*[" << this->n << "]\n";
+ for ( int i = 0; i < this->n; i++ ) {
+ if ( !k(i).prevChildBucket.isNull() ) {
+ DiskLoc ll = k(i).prevChildBucket;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
+ }
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ ll.btree<V>()->_shape(level+1,ss);
+ }
+ }
+
+ int bt_fv=0;
+ int bt_dmp=0;
+
+ template< class V >
+ void BtreeBucket<V>::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const {
+ bt_dmp=1;
+ fullValidate(thisLoc, order);
+ bt_dmp=0;
+ }
+
+ template< class V >
+ long long BtreeBucket<V>::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount, bool strict, unsigned depth) const {
+ {
+ bool f = false;
+ assert( f = true );
+ massert( 10281 , "assert is misdefined", f);
+ }
+
+ killCurrentOp.checkForInterrupt();
+ this->assertValid(order, true);
+
+ if ( bt_dmp ) {
+ _log() << thisLoc.toString() << ' ';
+ ((BtreeBucket *) this)->dump(depth);
+ }
+
+ // keycount
+ long long kc = 0;
+
+ for ( int i = 0; i < this->n; i++ ) {
+ const _KeyNode& kn = this->k(i);
+
+ if ( kn.isUsed() ) {
+ kc++;
+ }
+ else {
+ if ( unusedCount ) {
+ ++( *unusedCount );
+ }
+ }
+ if ( !kn.prevChildBucket.isNull() ) {
+ DiskLoc left = kn.prevChildBucket;
+ const BtreeBucket *b = left.btree<V>();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict, depth+1);
+ }
+ }
+ if ( !this->nextChild.isNull() ) {
+ DiskLoc ll = this->nextChild;
+ const BtreeBucket *b = ll.btree<V>();
+ if ( strict ) {
+ assert( b->parent == thisLoc );
+ }
+ else {
+ wassert( b->parent == thisLoc );
+ }
+ kc += b->fullValidate(this->nextChild, order, unusedCount, strict, depth+1);
+ }
+
+ return kc;
+ }
+
+ int nDumped = 0;
+
+ template< class V >
+ void BucketBasics<V>::assertValid(const Ordering &order, bool force) const {
+ if ( !debug && !force )
+ return;
+ {
+ int foo = this->n;
+ wassert( foo >= 0 && this->n < Size() );
+ foo = this->emptySize;
+ wassert( foo >= 0 && this->emptySize < V::BucketSize );
+ wassert( this->topSize >= this->n && this->topSize <= V::BucketSize );
+ }
+
+ // this is very slow so don't do often
+ {
+ static int _k;
+ if( ++_k % 128 )
+ return;
+ }
+
+ DEV {
+ // slow:
+ for ( int i = 0; i < this->n-1; i++ ) {
+ Key k1 = keyNode(i).key;
+ Key k2 = keyNode(i+1).key;
+ int z = k1.woCompare(k2, order); //OK
+ if ( z > 0 ) {
+ out() << "ERROR: btree key order corrupt. Keys:" << endl;
+ if ( ++nDumped < 5 ) {
+ for ( int j = 0; j < this->n; j++ ) {
+ out() << " " << keyNode(j).key.toString() << endl;
+ }
+ ((BtreeBucket<V> *) this)->dump();
+ }
+ wassert(false);
+ break;
+ }
+ else if ( z == 0 ) {
+ if ( !(k(i).recordLoc < k(i+1).recordLoc) ) {
+ out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl;
+ out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
+ out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
+ wassert( k(i).recordLoc < k(i+1).recordLoc );
+ }
+ }
+ }
+ }
+ else {
+ //faster:
+ if ( this->n > 1 ) {
+ Key k1 = keyNode(0).key;
+ Key k2 = keyNode(this->n-1).key;
+ int z = k1.woCompare(k2, order);
+ //wassert( z <= 0 );
+ if ( z > 0 ) {
+ problem() << "btree keys out of order" << '\n';
+ ONCE {
+ ((BtreeBucket<V> *) this)->dump();
+ }
+ assert(false);
+ }
+ }
+ }
+ }
+
+ template< class V >
+ inline void BucketBasics<V>::markUnused(int keypos) {
+ assert( keypos >= 0 && keypos < this->n );
+ k(keypos).setUnused();
+ }
+
+ template< class V >
+ inline int BucketBasics<V>::totalDataSize() const {
+ return (int) (Size() - (this->data-(char*)this));
+ }
+
+ template< class V >
+ void BucketBasics<V>::init() {
+ this->_init();
+ this->parent.Null();
+ this->nextChild.Null();
+ this->flags = Packed;
+ this->n = 0;
+ this->emptySize = totalDataSize();
+ this->topSize = 0;
+ }
+
+ /** see _alloc */
+ template< class V >
+ inline void BucketBasics<V>::_unalloc(int bytes) {
+ this->topSize -= bytes;
+ this->emptySize += bytes;
+ }
+
+ /**
+ * we allocate space from the end of the buffer for data.
+ * the keynodes grow from the front.
+ */
+ template< class V >
+ inline int BucketBasics<V>::_alloc(int bytes) {
+ assert( this->emptySize >= bytes );
+ this->topSize += bytes;
+ this->emptySize -= bytes;
+ int ofs = totalDataSize() - this->topSize;
+ assert( ofs > 0 );
+ return ofs;
+ }
+
+ template< class V >
+ void BucketBasics<V>::_delKeyAtPos(int keypos, bool mayEmpty) {
+ // TODO This should be keypos < n
+ assert( keypos >= 0 && keypos <= this->n );
+ assert( childForPos(keypos).isNull() );
+ // TODO audit cases where nextChild is null
+ assert( ( mayEmpty && this->n > 0 ) || this->n > 1 || this->nextChild.isNull() );
+ this->emptySize += sizeof(_KeyNode);
+ this->n--;
+ for ( int j = keypos; j < this->n; j++ )
+ k(j) = k(j+1);
+ setNotPacked();
+ }
+
+ /**
+ * pull rightmost key from the bucket. this version requires its right child to be null so it
+ * does not bother returning that value.
+ */
+ template< class V >
+ void BucketBasics<V>::popBack(DiskLoc& recLoc, Key &key) {
+ massert( 10282 , "n==0 in btree popBack()", this->n > 0 );
+ assert( k(this->n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that
+ KeyNode kn = keyNode(this->n-1);
+ recLoc = kn.recordLoc;
+ key.assign(kn.key);
+ int keysize = kn.key.dataSize();
+
+ massert( 10283 , "rchild not null in btree popBack()", this->nextChild.isNull());
+
+ // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full.
+ this->nextChild = kn.prevChildBucket;
+
+ this->n--;
+ // This is risky because the key we are returning points to this unalloc'ed memory,
+ // and we are assuming that the last key points to the last allocated
+ // bson region.
+ this->emptySize += sizeof(_KeyNode);
+ _unalloc(keysize);
+ }
+
+ /** add a key. must be > all existing. be careful to set next ptr right. */
+ template< class V >
+ bool BucketBasics<V>::_pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize )
+ return false;
+ assert( bytesNeeded <= this->emptySize );
+ if( this->n ) {
+ const KeyNode klast = keyNode(this->n-1);
+ if( klast.key.woCompare(key, order) > 0 ) {
+ log() << "btree bucket corrupt? consider reindexing or running validate command" << endl;
+ log() << " klast: " << keyNode(this->n-1).key.toString() << endl;
+ log() << " key: " << key.toString() << endl;
+ DEV klast.key.woCompare(key, order);
+ assert(false);
+ }
+ }
+ this->emptySize -= sizeof(_KeyNode);
+ _KeyNode& kn = k(this->n++);
+ kn.prevChildBucket = prevChild;
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs( (short) _alloc(key.dataSize()) );
+ short ofs = kn.keyDataOfs();
+ char *p = dataAt(ofs);
+ memcpy(p, key.data(), key.dataSize());
+
+ return true;
+ }
+
+ /* durability note
+ we do separate intent declarations herein. arguably one could just declare
+ the whole bucket given we do group commits. this is something we could investigate
+ later as to what is faster under what situations.
+ */
+ /** insert a key in a bucket with no complexity -- no splits required
+ @return false if a split is required.
+ */
+ template< class V >
+ bool BucketBasics<V>::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const {
+ check( this->n < 1024 );
+ check( keypos >= 0 && keypos <= this->n );
+ int bytesNeeded = key.dataSize() + sizeof(_KeyNode);
+ if ( bytesNeeded > this->emptySize ) {
+ _pack(thisLoc, order, keypos);
+ if ( bytesNeeded > this->emptySize )
+ return false;
+ }
+
+ BucketBasics *b;
+ {
+ const char *p = (const char *) &k(keypos);
+ const char *q = (const char *) &k(this->n+1);
+ // declare that we will write to [k(keypos),k(n)]
+ // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so
+ // we can log a very small amount.
+ b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p);
+
+ // e.g. n==3, keypos==2
+ // 1 4 9
+ // ->
+ // 1 4 _ 9
+ for ( int j = this->n; j > keypos; j-- ) // make room
+ b->k(j) = b->k(j-1);
+ }
+
+ getDur().declareWriteIntent(&b->emptySize, sizeof(this->emptySize)+sizeof(this->topSize)+sizeof(this->n));
+ b->emptySize -= sizeof(_KeyNode);
+ b->n++;
+
+ // This _KeyNode was marked for writing above.
+ _KeyNode& kn = b->k(keypos);
+ kn.prevChildBucket.Null();
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs((short) b->_alloc(key.dataSize()) );
+ char *p = b->dataAt(kn.keyDataOfs());
+ getDur().declareWriteIntent(p, key.dataSize());
+ memcpy(p, key.data(), key.dataSize());
+ return true;
+ }
+
+ /**
+ * With this implementation, refPos == 0 disregards effect of refPos.
+ * index > 0 prevents creation of an empty bucket.
+ */
+ template< class V >
+ bool BucketBasics<V>::mayDropKey( int index, int refPos ) const {
+ return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull();
+ }
+
+ template< class V >
+ int BucketBasics<V>::packedDataSize( int refPos ) const {
+ if ( this->flags & Packed ) {
+ return V::BucketSize - this->emptySize - headerSize();
+ }
+ int size = 0;
+ for( int j = 0; j < this->n; ++j ) {
+ if ( mayDropKey( j, refPos ) ) {
+ continue;
+ }
+ size += keyNode( j ).key.dataSize() + sizeof( _KeyNode );
+ }
+ return size;
+ }
+
+ /**
+ * when we delete things we just leave empty space until the node is
+ * full and then we repack it.
+ */
+ template< class V >
+ void BucketBasics<V>::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const {
+ if ( this->flags & Packed )
+ return;
+
+ VERIFYTHISLOC
+
+ /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl.
+       an empirical approach is probably better than just adding new code: perhaps the bucket would need
+ declaration anyway within the group commit interval, in which case we would just be adding
+ code and complexity without benefit.
+ */
+ thisLoc.btreemod<V>()->_packReadyForMod(order, refPos);
+ }
+
+ /** version when write intent already declared */
+ template< class V >
+ void BucketBasics<V>::_packReadyForMod( const Ordering &order, int &refPos ) {
+ assertWritable();
+
+ if ( this->flags & Packed )
+ return;
+
+ int tdz = totalDataSize();
+ char temp[V::BucketSize];
+ int ofs = tdz;
+ this->topSize = 0;
+ int i = 0;
+ for ( int j = 0; j < this->n; j++ ) {
+ if( mayDropKey( j, refPos ) ) {
+ continue; // key is unused and has no children - drop it
+ }
+ if( i != j ) {
+ if ( refPos == j ) {
+ refPos = i; // i < j so j will never be refPos again
+ }
+ k( i ) = k( j );
+ }
+ short ofsold = k(i).keyDataOfs();
+ int sz = keyNode(i).key.dataSize();
+ ofs -= sz;
+ this->topSize += sz;
+ memcpy(temp+ofs, dataAt(ofsold), sz);
+ k(i).setKeyDataOfsSavingUse( ofs );
+ ++i;
+ }
+ if ( refPos == this->n ) {
+ refPos = i;
+ }
+ this->n = i;
+ int dataUsed = tdz - ofs;
+ memcpy(this->data + ofs, temp + ofs, dataUsed);
+
+ // assertWritable();
+ // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this));
+
+ this->emptySize = tdz - dataUsed - this->n * sizeof(_KeyNode);
+ {
+ int foo = this->emptySize;
+ assert( foo >= 0 );
+ }
+
+ setPacked();
+
+ assertValid( order );
+ }
+
+ template< class V >
+ inline void BucketBasics<V>::truncateTo(int N, const Ordering &order, int &refPos) {
+ d.dbMutex.assertWriteLocked();
+ assertWritable();
+
+ this->n = N;
+ setNotPacked();
+ _packReadyForMod( order, refPos );
+ }
+
+ /**
+ * In the standard btree algorithm, we would split based on the
+ * existing keys _and_ the new key. But that's more work to
+ * implement, so we split the existing keys and then add the new key.
+ *
+ * There are several published heuristic algorithms for doing splits,
+ * but basically what you want are (1) even balancing between the two
+ * sides and (2) a small split key so the parent can have a larger
+ * branching factor.
+ *
+ * We just have a simple algorithm right now: if a key includes the
+ * halfway point (or 10% way point) in terms of bytes, split on that key;
+ * otherwise split on the key immediately to the left of the halfway
+ * point (or 10% point).
+ *
+ * This function is expected to be called on a packed bucket.
+ */
+ template< class V >
+ int BucketBasics<V>::splitPos( int keypos ) const {
+ assert( this->n > 2 );
+ int split = 0;
+ int rightSize = 0;
+ // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split.
+ // see SERVER-983
+ // TODO I think we only want to do the 90% split on the rhs node of the tree.
+ int rightSizeLimit = ( this->topSize + sizeof( _KeyNode ) * this->n ) / ( keypos == this->n ? 10 : 2 );
+ for( int i = this->n - 1; i > -1; --i ) {
+ rightSize += keyNode( i ).key.dataSize() + sizeof( _KeyNode );
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > this->n - 2 ) {
+ split = this->n - 2;
+ }
+
+ return split;
+ }
+
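+    // Worked example of the policy above (editorial sketch, not part of this
+    // commit): with n == 100 equally sized keys, a middle insert (keypos < n)
+    // sets rightSizeLimit to half the packed data, so split lands near key
+    // 50; an insert after all existing keys (keypos == n) sets it to one
+    // tenth, so split lands near key 90 and ~90% of the data stays on the
+    // left, keeping buckets densely packed under strictly increasing inserts.
+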
+ template< class V >
+ void BucketBasics<V>::reserveKeysFront( int nAdd ) {
+ assert( this->emptySize >= int( sizeof( _KeyNode ) * nAdd ) );
+ this->emptySize -= sizeof( _KeyNode ) * nAdd;
+ for( int i = this->n - 1; i > -1; --i ) {
+ k( i + nAdd ) = k( i );
+ }
+ this->n += nAdd;
+ }
+
+ template< class V >
+ void BucketBasics<V>::setKey( int i, const DiskLoc recordLoc, const Key &key, const DiskLoc prevChildBucket ) {
+ _KeyNode &kn = k( i );
+ kn.recordLoc = recordLoc;
+ kn.prevChildBucket = prevChildBucket;
+ short ofs = (short) _alloc( key.dataSize() );
+ kn.setKeyDataOfs( ofs );
+ char *p = dataAt( ofs );
+ memcpy( p, key.data(), key.dataSize() );
+ }
+
+ template< class V >
+ void BucketBasics<V>::dropFront( int nDrop, const Ordering &order, int &refpos ) {
+ for( int i = nDrop; i < this->n; ++i ) {
+ k( i - nDrop ) = k( i );
+ }
+ this->n -= nDrop;
+ setNotPacked();
+ _packReadyForMod( order, refpos );
+ }
+
+ /* - BtreeBucket --------------------------------------------------- */
+
+    /** find the largest key in the subtree rooted at thisLoc; the result is returned via largestLoc (bucket) and largestKey (key index). */
+ template< class V >
+ void BtreeBucket<V>::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) {
+ DiskLoc loc = thisLoc;
+ while ( 1 ) {
+ const BtreeBucket *b = loc.btree<V>();
+ if ( !b->nextChild.isNull() ) {
+ loc = b->nextChild;
+ continue;
+ }
+
+ assert(b->n>0);
+ largestLoc = loc;
+ largestKey = b->n-1;
+
+ break;
+ }
+ }
+
+ /**
+ * NOTE Currently the Ordering implementation assumes a compound index will
+ * not have more keys than an unsigned variable has bits. The same
+ * assumption is used in the implementation below with respect to the 'mask'
+ * variable.
+ *
+ * @param l a regular bsonobj
+ * @param rBegin composed partly of an existing bsonobj, and the remaining keys are taken from a vector of elements that frequently changes
+ *
+ * see
+ * jstests/index_check6.js
+ * https://jira.mongodb.org/browse/SERVER-371
+ */
+ /* static */
+ template< class V >
+ int BtreeBucket<V>::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) {
+ BSONObjIterator ll( l );
+ BSONObjIterator rr( rBegin );
+ vector< const BSONElement * >::const_iterator rr2 = rEnd.begin();
+ vector< bool >::const_iterator inc = rEndInclusive.begin();
+ unsigned mask = 1;
+ for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) {
+ BSONElement lll = ll.next();
+ BSONElement rrr = rr.next();
+ ++rr2;
+ ++inc;
+
+ int x = lll.woCompare( rrr, false );
+ if ( o.descending( mask ) )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ }
+ if ( rSup ) {
+ return -direction;
+ }
+ for( ; ll.more(); mask <<= 1 ) {
+ BSONElement lll = ll.next();
+ BSONElement rrr = **rr2;
+ ++rr2;
+ int x = lll.woCompare( rrr, false );
+ if ( o.descending( mask ) )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ if ( !*inc ) {
+ return -direction;
+ }
+ ++inc;
+ }
+ return 0;
+ }
+
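+    // Example (editorial sketch, not part of this commit): for index
+    // { a:1, b:1 } with l = { a:1, b:2 }, rBegin = { a:1 }, rBeginLen = 1 and
+    // rSup = false, the first loop consumes 'a' (equal) and advances past the
+    // first rEnd/rEndInclusive slots; the second loop then compares b:2 with
+    // the element pointed to by rEnd[1]. If they are equal and
+    // rEndInclusive[1] is true the result is 0 (l lies on the inclusive
+    // bound); if it is false the result is -direction, so for direction == 1
+    // l is treated as sorting before the exclusive bound.
+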
+ template< class V >
+ bool BtreeBucket<V>::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const {
+ int pos;
+ bool found;
+ DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+ // skip unused keys
+ while ( 1 ) {
+ if( b.isNull() )
+ break;
+ const BtreeBucket *bucket = b.btree<V>();
+ const _KeyNode& kn = bucket->k(pos);
+ if ( kn.isUsed() )
+ return bucket->keyAt(pos).woEqual(key);
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::exists");
+ }
+ return false;
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::wouldCreateDup(
+ const IndexDetails& idx, const DiskLoc &thisLoc,
+ const Key& key, const Ordering& order,
+ const DiskLoc &self) const {
+ int pos;
+ bool found;
+ DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc);
+
+ while ( !b.isNull() ) {
+ // we skip unused keys
+ const BtreeBucket *bucket = b.btree<V>();
+ const _KeyNode& kn = bucket->k(pos);
+ if ( kn.isUsed() ) {
+ if( bucket->keyAt(pos).woEqual(key) )
+ return kn.recordLoc != self;
+ break;
+ }
+ b = bucket->advance(b, pos, 1, "BtreeBucket<V>::dupCheck");
+ }
+
+ return false;
+ }
+
+ template< class V >
+ string BtreeBucket<V>::dupKeyError( const IndexDetails& idx , const Key& key ) {
+ stringstream ss;
+ ss << "E11000 duplicate key error ";
+ ss << "index: " << idx.indexNamespace() << " ";
+ ss << "dup key: " << key.toString();
+ return ss.str();
+ }
+
+ /**
+     * Find a key within this btree bucket.
+     *
+     * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
+     * key. This ensures that even when there are many duplicates (e.g., 1 million) for a key,
+     * our performance is still good.
+ *
+ * assertIfDup: if the key exists (ignoring the recordLoc), uassert
+ *
+ * pos: for existing keys k0...kn-1.
+ * returns # it goes BEFORE. so key[pos-1] < key < key[pos]
+ * returns n if it goes after the last existing key.
+ * note result might be an Unused location!
+ */
+
+ bool guessIncreasing = false;
+ template< class V >
+ bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &rl,
+ const Ordering &order, int& pos, bool assertIfDup) const {
+ Loc recordLoc;
+ recordLoc = rl;
+ globalIndexCounters.btree( (char*)this );
+
+ // binary search for this key
+ bool dupsChecked = false;
+ int l=0;
+ int h=this->n-1;
+ int m = (l+h)/2;
+ if( guessIncreasing ) {
+ m = h;
+ }
+ while ( l <= h ) {
+ KeyNode M = this->keyNode(m);
+ int x = key.woCompare(M.key, order);
+ if ( x == 0 ) {
+ if( assertIfDup ) {
+ if( k(m).isUnused() ) {
+                        // it is ok for the key to be present if it is unused, but then we must check
+                        // that there are no other (used) entries for the key. it is very rare that we
+                        // get here, so we make no effort to make this particularly fast
+ if( !dupsChecked ) {
+ dupsChecked = true;
+ if( idx.head.btree<V>()->exists(idx, idx.head, key, order) ) {
+ if( idx.head.btree<V>()->wouldCreateDup(idx, idx.head, key, order, recordLoc) )
+ uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+ else
+ alreadyInIndex();
+ }
+ }
+ }
+ else {
+ if( M.recordLoc == recordLoc )
+ alreadyInIndex();
+ uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) );
+ }
+ }
+
+ // dup keys allowed. use recordLoc as if it is part of the key
+ Loc unusedRL = M.recordLoc;
+ unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up
+ x = recordLoc.compare(unusedRL);
+ }
+ if ( x < 0 ) // key < M.key
+ h = m-1;
+ else if ( x > 0 )
+ l = m+1;
+ else {
+ // found it.
+ pos = m;
+ return true;
+ }
+ m = (l+h)/2;
+ }
+ // not found
+ pos = l;
+ if ( pos != this->n ) {
+ Key keyatpos = keyNode(pos).key;
+ wassert( key.woCompare(keyatpos, order) <= 0 );
+ if ( pos > 0 ) {
+ if( !( keyNode(pos-1).key.woCompare(key, order) <= 0 ) ) {
+ DEV {
+ log() << key.toString() << endl;
+ log() << keyNode(pos-1).key.toString() << endl;
+ }
+ wassert(false);
+ }
+ }
+ }
+
+ return false;
+ }
+
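+    // Example (editorial sketch, not part of this commit), with dups allowed:
+    // for bucket keys [ 10, 20, 20, 30 ], the two 20s ordered by recordLoc,
+    // find() for key 25 misses and returns false with pos == 3, since 25
+    // sorts before key[3] == 30; find() for key 20 with a recordLoc between
+    // the two existing entries returns false with pos == 2, the insertion
+    // point that keeps the (key, recordLoc) pairs sorted.
+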
+ template< class V >
+ void BtreeBucket<V>::delBucket(const DiskLoc thisLoc, const IndexDetails& id) {
+ ClientCursor::informAboutToDeleteBucket(thisLoc); // slow...
+ assert( !isHead() );
+
+ DiskLoc ll = this->parent;
+ const BtreeBucket *p = ll.btree<V>();
+ int parentIdx = indexInParent( thisLoc );
+ p->childForPos( parentIdx ).writing().Null();
+ deallocBucket( thisLoc, id );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) {
+#if 0
+ // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete
+ // it (meaning it is ineligible for reuse).
+ memset(this, 0, Size());
+#else
+ // defensive:
+ this->n = -1;
+ this->parent.Null();
+ string ns = id.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc);
+#endif
+ }
+
+    /** note: may delete the entire bucket!  'this' may be invalid upon return. */
+ template< class V >
+ void BtreeBucket<V>::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) {
+ assert(this->n>0);
+ DiskLoc left = this->childForPos(p);
+
+ if ( this->n == 1 ) {
+ if ( left.isNull() && this->nextChild.isNull() ) {
+ this->_delKeyAtPos(p);
+ if ( isHead() ) {
+ // we don't delete the top bucket ever
+ }
+ else {
+ if ( !mayBalanceWithNeighbors( thisLoc, id, order ) ) {
+ // An empty bucket is only allowed as a transient state. If
+ // there are no neighbors to balance with, we delete ourself.
+ // This condition is only expected in legacy btrees.
+ delBucket(thisLoc, id);
+ }
+ }
+ return;
+ }
+ deleteInternalKey( thisLoc, p, id, order );
+ return;
+ }
+
+ if ( left.isNull() ) {
+ this->_delKeyAtPos(p);
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ else {
+ deleteInternalKey( thisLoc, p, id, order );
+ }
+ }
+
+ /**
+ * This function replaces the specified key (k) by either the prev or next
+ * key in the btree (k'). We require that k have either a left or right
+ * child. If k has a left child, we set k' to the prev key of k, which must
+ * be a leaf present in the left child. If k does not have a left child, we
+ * set k' to the next key of k, which must be a leaf present in the right
+ * child. When we replace k with k', we copy k' over k (which may cause a
+ * split) and then remove k' from its original location. Because k' is
+ * stored in a descendent of k, replacing k by k' will not modify the
+ * storage location of the original k', and we can easily remove k' from
+ * its original location.
+ *
+ * This function is only needed in cases where k has a left or right child;
+ * in other cases a simpler key removal implementation is possible.
+ *
+ * NOTE on legacy btree structures:
+ * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by
+ * marking it as an unused node rather than replacing it with k'. Also, k'
+ * may be a leaf but marked as an unused node. In such a case we replace
+ * k by k', preserving the key's unused marking. This function is only
+ * expected to mark a key as unused when handling a legacy btree.
+ */
+ template< class V >
+ void BtreeBucket<V>::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( keypos );
+ DiskLoc rchild = this->childForPos( keypos + 1 );
+ assert( !lchild.isNull() || !rchild.isNull() );
+ int advanceDirection = lchild.isNull() ? 1 : -1;
+ int advanceKeyOfs = keypos;
+ DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ );
+        // advanceLoc must be a descendant of thisLoc, because thisLoc has a
+ // child in the proper direction and all descendants of thisLoc must be
+ // nonempty because they are not the root.
+
+ if ( !advanceLoc.btree<V>()->childForPos( advanceKeyOfs ).isNull() ||
+ !advanceLoc.btree<V>()->childForPos( advanceKeyOfs + 1 ).isNull() ) {
+ // only expected with legacy btrees, see note above
+ this->markUnused( keypos );
+ return;
+ }
+
+ KeyNode kn = advanceLoc.btree<V>()->keyNode( advanceKeyOfs );
+ // Because advanceLoc is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of advanceLoc and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, this->childForPos( keypos ), this->childForPos( keypos + 1 ), id );
+ advanceLoc.btreemod<V>()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order );
+ }
+
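+    // Example (editorial sketch, not part of this commit): to delete the
+    // internal key 40 from a bucket [ 20 | 40 | 60 ] whose child between 20
+    // and 40 is the leaf [ 30 35 ], 40 has a left child, so advance() with
+    // direction -1 locates its predecessor 35; setInternalKey() overwrites
+    // 40 with 35 in place (children preserved), then delKeyAtPos() removes
+    // 35 from the leaf, rebalancing there if it falls below the low water
+    // mark.
+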
+//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>())
+#define BTREE(loc) (loc.template btree<V>())
+//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>())
+#define BTREEMOD(loc) (loc.template btreemod<V>())
+
+ template< class V >
+ void BtreeBucket<V>::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) {
+ assert( this->n == 0 && !this->nextChild.isNull() );
+ if ( this->parent.isNull() ) {
+ assert( id.head == thisLoc );
+ id.head.writing() = this->nextChild;
+ }
+ else {
+ DiskLoc ll = this->parent;
+ ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild;
+ }
+ BTREE(this->nextChild)->parent.writing() = this->parent;
+ ClientCursor::informAboutToDeleteBucket( thisLoc );
+ deallocBucket( thisLoc, id );
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const {
+ assert( leftIndex >= 0 && leftIndex < this->n );
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
+ if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) {
+ // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway
+ return false;
+ }
+ int pos = 0;
+ {
+ const BtreeBucket *l = leftNodeLoc.btree<V>();
+ const BtreeBucket *r = rightNodeLoc.btree<V>();
+ if ( ( this->headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.dataSize() + sizeof(_KeyNode) > unsigned( V::BucketSize ) ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * This implementation must respect the meaning and value of lowWaterMark.
+ * Also see comments in splitPos().
+ */
+ template< class V >
+ int BtreeBucket<V>::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const {
+ int split = -1;
+ int rightSize = 0;
+ const BtreeBucket *l = BTREE(this->childForPos( leftIndex ));
+ const BtreeBucket *r = BTREE(this->childForPos( leftIndex + 1 ));
+
+ int KNS = sizeof( _KeyNode );
+ int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.dataSize() + KNS + r->topSize + r->n * KNS ) / 2;
+ // This constraint should be ensured by only calling this function
+ // if we go below the low water mark.
+ assert( rightSizeLimit < BtreeBucket<V>::bodySize() );
+ for( int i = r->n - 1; i > -1; --i ) {
+ rightSize += r->keyNode( i ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n + 1 + i;
+ break;
+ }
+ }
+ if ( split == -1 ) {
+ rightSize += keyNode( leftIndex ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = l->n;
+ }
+ }
+ if ( split == -1 ) {
+ for( int i = l->n - 1; i > -1; --i ) {
+ rightSize += l->keyNode( i ).key.dataSize() + KNS;
+ if ( rightSize > rightSizeLimit ) {
+ split = i;
+ break;
+ }
+ }
+ }
+ // safeguards - we must not create an empty bucket
+ if ( split < 1 ) {
+ split = 1;
+ }
+ else if ( split > l->n + 1 + r->n - 2 ) {
+ split = l->n + 1 + r->n - 2;
+ }
+
+ return split;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc leftNodeLoc = this->childForPos( leftIndex );
+ DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 );
+ BtreeBucket *l = leftNodeLoc.btreemod<V>();
+ BtreeBucket *r = rightNodeLoc.btreemod<V>();
+ int pos = 0;
+ l->_packReadyForMod( order, pos );
+ r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys
+
+ // We know the additional keys below will fit in l because canMergeChildren()
+ // must be true.
+ int oldLNum = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < r->n; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ l->nextChild = r->nextChild;
+ l->fixParentPtrs( leftNodeLoc, oldLNum );
+ r->delBucket( rightNodeLoc, id );
+ this->childForPos( leftIndex + 1 ) = leftNodeLoc;
+ this->childForPos( leftIndex ) = DiskLoc();
+ this->_delKeyAtPos( leftIndex, true );
+ if ( this->n == 0 ) {
+ // will trash this and thisLoc
+ // TODO To ensure all leaves are of equal height, we should ensure
+ // this is only called on the root.
+ replaceWithNextChild( thisLoc, id );
+ }
+ else {
+ // balance recursively - maybe we should do this even when n == 0?
+ mayBalanceWithNeighbors( thisLoc, id, order );
+ }
+ }
+
+ template< class V >
+ int BtreeBucket<V>::indexInParent( const DiskLoc &thisLoc ) const {
+ assert( !this->parent.isNull() );
+ const BtreeBucket *p = BTREE(this->parent);
+ if ( p->nextChild == thisLoc ) {
+ return p->n;
+ }
+ else {
+ for( int i = 0; i < p->n; ++i ) {
+ if ( p->k( i ).prevChildBucket == thisLoc ) {
+ return i;
+ }
+ }
+ }
+ out() << "ERROR: can't find ref to child bucket.\n";
+ out() << "child: " << thisLoc << "\n";
+ dump();
+ out() << "Parent: " << this->parent << "\n";
+ p->dump();
+ assert(false);
+ return -1; // just to compile
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const {
+ // If we can merge, then we must merge rather than balance to preserve
+ // bucket utilization constraints.
+ if ( canMergeChildren( thisLoc, leftIndex ) ) {
+ return false;
+ }
+ thisLoc.btreemod<V>()->doBalanceChildren( thisLoc, leftIndex, id, order );
+ return true;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ // TODO maybe do some audits the same way pushBack() does?
+ // As a precondition, rchild + the old separator are <= half a body size,
+ // and lchild is at most completely full. Based on the value of split,
+ // rchild will get <= half of the total bytes which is at most 75%
+ // of a full body. So rchild will have room for the following keys:
+ int rAdd = l->n - split;
+ r->reserveKeysFront( rAdd );
+ for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) {
+ KeyNode kn = l->keyNode( i );
+ r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = keyNode( leftIndex );
+ r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ r->fixParentPtrs( rchild, 0, rAdd - 1 );
+ {
+ KeyNode kn = l->keyNode( split );
+ l->nextChild = kn.prevChildBucket;
+ // Because lchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of lchild and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the left of split.
+ l->truncateTo( split, order, zeropos );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order ) {
+ // As a precondition, lchild + the old separator are <= half a body size,
+ // and rchild is at most completely full. Based on the value of split,
+ // lchild will get less than half of the total bytes which is at most 75%
+ // of a full body. So lchild will have room for the following keys:
+ int lN = l->n;
+ {
+ KeyNode kn = keyNode( leftIndex );
+ l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child
+ }
+ for( int i = 0; i < split - lN - 1; ++i ) {
+ KeyNode kn = r->keyNode( i );
+ l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket );
+ }
+ {
+ KeyNode kn = r->keyNode( split - lN - 1 );
+ l->nextChild = kn.prevChildBucket;
+            // Child lN was lchild's old nextChild, so we don't need to fix that one.
+ l->fixParentPtrs( lchild, lN + 1, l->n );
+ // Because rchild is a descendant of thisLoc, updating thisLoc will
+ // not affect packing or keys of rchild and kn will be stable
+ // during the following setInternalKey()
+ setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id );
+ }
+ int zeropos = 0;
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the right of split.
+ r->dropFront( split - lN, order, zeropos );
+ }
+
+ template< class V >
+ void BtreeBucket<V>::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) {
+ DiskLoc lchild = this->childForPos( leftIndex );
+ DiskLoc rchild = this->childForPos( leftIndex + 1 );
+ int zeropos = 0;
+ BtreeBucket *l = lchild.btreemod<V>();
+ l->_packReadyForMod( order, zeropos );
+ BtreeBucket *r = rchild.btreemod<V>();
+ r->_packReadyForMod( order, zeropos );
+ int split = rebalancedSeparatorPos( thisLoc, leftIndex );
+
+ // By definition, if we are below the low water mark and cannot merge
+ // then we must actively balance.
+ assert( split != l->n );
+ if ( split < l->n ) {
+ doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ else {
+ doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order );
+ }
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const {
+ if ( this->parent.isNull() ) { // we are root, there are no neighbors
+ return false;
+ }
+
+ if ( this->packedDataSize( 0 ) >= this->lowWaterMark() ) {
+ return false;
+ }
+
+ const BtreeBucket *p = BTREE(this->parent);
+ int parentIdx = indexInParent( thisLoc );
+
+ // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so?
+ bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() );
+ bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() );
+
+ // Balance if possible on one side - we merge only if absolutely necessary
+ // to preserve btree bucket utilization constraints since that's a more
+ // heavy duty operation (especially if we must re-split later).
+ if ( mayBalanceRight &&
+ p->tryBalanceChildren( this->parent, parentIdx, id, order ) ) {
+ return true;
+ }
+ if ( mayBalanceLeft &&
+ p->tryBalanceChildren( this->parent, parentIdx - 1, id, order ) ) {
+ return true;
+ }
+
+ BtreeBucket *pm = BTREEMOD(this->parent);
+ if ( mayBalanceRight ) {
+ pm->doMergeChildren( this->parent, parentIdx, id, order );
+ return true;
+ }
+ else if ( mayBalanceLeft ) {
+ pm->doMergeChildren( this->parent, parentIdx - 1, id, order );
+ return true;
+ }
+
+ return false;
+ }
+
+ /** remove a key from the index */
+ template< class V >
+ bool BtreeBucket<V>::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const {
+ int pos;
+ bool found;
+ const Ordering ord = Ordering::make(id.keyPattern());
+ DiskLoc loc = locate(id, thisLoc, key, ord, pos, found, recordLoc, 1);
+ if ( found ) {
+ if ( key.objsize() > this->KeyMax ) {
+ OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl;
+ }
+ loc.btreemod<V>()->delKeyAtPos(loc, id, pos, ord);
+ return true;
+ }
+ return false;
+ }
+
+ template< class V >
+ inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) {
+ if ( !child.isNull() ) {
+ if ( insert_debug )
+ out() << " fix " << child.toString() << ".parent=" << thisLoc.toString() << endl;
+ child.btree<V>()->parent.writing() = thisLoc;
+ }
+ }
+
+ /**
+ * This can cause a lot of additional page writes when we assign buckets to
+ * different parents. Maybe get rid of parent ptrs?
+ */
+ template< class V >
+ void BtreeBucket<V>::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const {
+ VERIFYTHISLOC
+ if ( lastIndex == -1 ) {
+ lastIndex = this->n;
+ }
+ for ( int i = firstIndex; i <= lastIndex; i++ ) {
+ fix(thisLoc, this->childForPos(i));
+ }
+ }
+
+ template< class V >
+ void BtreeBucket<V>::setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) {
+ this->childForPos( keypos ).Null();
+
+ // This may leave the bucket empty (n == 0) which is ok only as a
+ // transient state. In the instant case, the implementation of
+ // insertHere behaves correctly when n == 0 and as a side effect
+ // increments n.
+ this->_delKeyAtPos( keypos, true );
+
+ // Ensure we do not orphan neighbor's old child.
+ assert( this->childForPos( keypos ) == rchild );
+
+ // Just set temporarily - required to pass validation in insertHere()
+ this->childForPos( keypos ) = lchild;
+
+ insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx );
+ }
+
+ /**
+ * insert a key in this bucket, splitting if necessary.
+ * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost.
+ * NOTE this function may free some data, and as a result the value passed for keypos may
+ * be invalid after calling insertHere()
+ *
+ * Some of the write intent signaling below relies on the implementation of
+ * the optimized write intent code in basicInsert().
+ */
+ template< class V >
+ void BtreeBucket<V>::insertHere( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key, const Ordering& order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const {
+ if ( insert_debug )
+ out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' '
+ << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl;
+
+ if ( !this->basicInsert(thisLoc, keypos, recordLoc, key, order) ) {
+ // If basicInsert() fails, the bucket will be packed as required by split().
+ thisLoc.btreemod<V>()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx);
+ return;
+ }
+
+ {
+ const _KeyNode *_kn = &k(keypos);
+ _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert()
+ if ( keypos+1 == this->n ) { // last key
+ if ( this->nextChild != lchild ) {
+ out() << "ERROR nextChild != lchild" << endl;
+ out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
+ out() << " nextChild: " << this->nextChild.toString() << " lchild: " << lchild.toString() << endl;
+ out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+ out() << " key: " << key.toString() << endl;
+ dump();
+ assert(false);
+ }
+ kn->prevChildBucket = this->nextChild;
+ assert( kn->prevChildBucket == lchild );
+ this->nextChild.writing() = rchild;
+ if ( !rchild.isNull() )
+ BTREE(rchild)->parent.writing() = thisLoc;
+ }
+ else {
+ kn->prevChildBucket = lchild;
+ if ( k(keypos+1).prevChildBucket != lchild ) {
+ out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl;
+ out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
+ out() << " keyPos: " << keypos << " n:" << this->n << endl;
+ out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl;
+ out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
+ out() << " key: " << key.toString() << endl;
+ dump();
+ assert(false);
+ }
+ const Loc *pc = &k(keypos+1).prevChildBucket;
+ *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert()
+ if ( !rchild.isNull() )
+ rchild.btree<V>()->parent.writing() = thisLoc;
+ }
+ return;
+ }
+ }
+
+ template< class V >
+ void BtreeBucket<V>::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) {
+ this->assertWritable();
+
+ if ( split_debug )
+ out() << " " << thisLoc.toString() << ".split" << endl;
+
+ int split = this->splitPos( keypos );
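+ // Resulting layout (sketch): this bucket keeps keys [0, split), the key at
+ // index 'split' is promoted to the parent, and the new right bucket gets
+ // keys (split, n).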
+ DiskLoc rLoc = addBucket(idx);
+ BtreeBucket *r = rLoc.btreemod<V>();
+ if ( split_debug )
+ out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << this->n << endl;
+ for ( int i = split+1; i < this->n; i++ ) {
+ KeyNode kn = keyNode(i);
+ r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket);
+ }
+ r->nextChild = this->nextChild;
+ r->assertValid( order );
+
+ if ( split_debug )
+ out() << " new rLoc:" << rLoc.toString() << endl;
+ r = 0;
+ rLoc.btree<V>()->fixParentPtrs(rLoc);
+
+ {
+ KeyNode splitkey = keyNode(split);
+ this->nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
+ if ( split_debug ) {
+ out() << " splitkey key:" << splitkey.key.toString() << endl;
+ }
+
+ // Because thisLoc is a descendant of parent, updating parent will
+ // not affect packing or keys of thisLoc and splitkey will be stable
+ // during the following:
+
+ // promote splitkey to a parent node
+ if ( this->parent.isNull() ) {
+ // make a new parent if we were the root
+ DiskLoc L = addBucket(idx);
+ BtreeBucket *p = L.btreemod<V>();
+ p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc);
+ p->nextChild = rLoc;
+ p->assertValid( order );
+ this->parent = idx.head.writing() = L;
+ if ( split_debug )
+ out() << " we were root, making new root:" << hex << this->parent.getOfs() << dec << endl;
+ rLoc.btree<V>()->parent.writing() = this->parent;
+ }
+ else {
+ // set this before calling _insert - if it splits it will do fixParent() logic and change the value.
+ rLoc.btree<V>()->parent.writing() = this->parent;
+ if ( split_debug )
+ out() << " promoting splitkey key " << splitkey.key.toString() << endl;
+ BTREE(this->parent)->_insert(this->parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx);
+ }
+ }
+
+ int newpos = keypos;
+ // note this may trash splitkey.key. thus we had to promote it before finishing up here.
+ this->truncateTo(split, order, newpos);
+
+ // add our new key, there is room now
+ {
+ if ( keypos <= split ) {
+ if ( split_debug )
+ out() << " keypos<split, insertHere() the new key" << endl;
+ insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx);
+ }
+ else {
+ int kp = keypos-split-1;
+ assert(kp>=0);
+ BTREE(rLoc)->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx);
+ }
+ }
+
+ if ( split_debug )
+ out() << " split end " << hex << thisLoc.getOfs() << dec << endl;
+ }
+
+ /** start a new index off, empty */
+ template< class V >
+ DiskLoc BtreeBucket<V>::addBucket(const IndexDetails& id) {
+ string ns = id.indexNamespace();
+ DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, V::BucketSize, true);
+ BtreeBucket *b = BTREEMOD(loc);
+ b->init();
+ return loc;
+ }
+
+ void renameIndexNamespace(const char *oldNs, const char *newNs) {
+ renameNamespace( oldNs, newNs );
+ }
+
+ template< class V >
+ const DiskLoc BtreeBucket<V>::getHead(const DiskLoc& thisLoc) const {
+ DiskLoc p = thisLoc;
+ while ( !BTREE(p)->isHead() )
+ p = BTREE(p)->parent;
+ return p;
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const {
+ if ( keyOfs < 0 || keyOfs >= this->n ) {
+ out() << "ASSERT failure BtreeBucket<V>::advance, caller: " << caller << endl;
+ out() << " thisLoc: " << thisLoc.toString() << endl;
+ out() << " keyOfs: " << keyOfs << " n:" << this->n << " direction: " << direction << endl;
+ out() << bucketSummary() << endl;
+ assert(false);
+ }
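+ // In-order advance, in brief: if the child subtree between the current key
+ // and the next key in the direction of travel exists, descend to its
+ // extreme key (leftmost for direction > 0, rightmost otherwise); else move
+ // within this bucket; else walk back up until an ancestor is found whose
+ // key is adjacent to this child in the direction of travel.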
+ int adj = direction < 0 ? 1 : 0;
+ int ko = keyOfs + direction;
+ DiskLoc nextDown = this->childForPos(ko+adj);
+ if ( !nextDown.isNull() ) {
+ while ( 1 ) {
+ keyOfs = direction>0 ? 0 : BTREE(nextDown)->n - 1;
+ DiskLoc loc = BTREE(nextDown)->childForPos(keyOfs + adj);
+ if ( loc.isNull() )
+ break;
+ nextDown = loc;
+ }
+ return nextDown;
+ }
+
+ if ( ko < this->n && ko >= 0 ) {
+ keyOfs = ko;
+ return thisLoc;
+ }
+
+ // end of bucket. traverse back up.
+ DiskLoc childLoc = thisLoc;
+ DiskLoc ancestor = this->parent;
+ while ( 1 ) {
+ if ( ancestor.isNull() )
+ break;
+ const BtreeBucket *an = BTREE(ancestor);
+ for ( int i = 0; i < an->n; i++ ) {
+ if ( an->childForPos(i+adj) == childLoc ) {
+ keyOfs = i;
+ return ancestor;
+ }
+ }
+ assert( direction<0 || an->nextChild == childLoc );
+ // parent exhausted also, keep going up
+ childLoc = ancestor;
+ ancestor = an->parent;
+ }
+
+ return DiskLoc();
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ KeyOwned k(key);
+ return locate(idx, thisLoc, k, order, pos, found, recordLoc, direction);
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const Key& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const {
+ int p;
+ found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false);
+ if ( found ) {
+ pos = p;
+ return thisLoc;
+ }
+
+ DiskLoc child = this->childForPos(p);
+
+ if ( !child.isNull() ) {
+ DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction);
+ if ( !l.isNull() )
+ return l;
+ }
+
+ pos = p;
+ if ( direction < 0 )
+ return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc;
+ else
+ return pos == this->n ? DiskLoc() /*theend*/ : thisLoc;
+ }
+
+ template< class V >
+ bool BtreeBucket<V>::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) {
+ const BtreeBucket<V> * bucket = BTREE(thisLoc);
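+ // Binary search over (l, h]: on entry key(l) is on the near side of the
+ // target and key(h) on the far side (direction-adjusted); the loop narrows
+ // until l + 1 == h, then descends into the child between those two keys,
+ // recording the position in bestParent in case we must back out.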
+ while( 1 ) {
+ if ( l + 1 == h ) {
+ keyOfs = ( direction > 0 ) ? h : l;
+ DiskLoc next = bucket->k( h ).prevChildBucket;
+ if ( !next.isNull() ) {
+ bestParent = make_pair( thisLoc, keyOfs );
+ thisLoc = next;
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+ int m = l + ( h - l ) / 2;
+ int cmp = customBSONCmp( bucket->keyNode( m ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ if ( cmp < 0 ) {
+ l = m;
+ }
+ else if ( cmp > 0 ) {
+ h = m;
+ }
+ else {
+ if ( direction < 0 ) {
+ l = m;
+ }
+ else {
+ h = m;
+ }
+ }
+ }
+ }
+
+ /**
+ * Find the smallest/biggest key greater-equal/less-equal to the one specified.
+ * On entry, thisLoc + keyOfs must be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd.
+ * All the direction checks below allowed me to refactor the code, but separate forward and reverse implementations might possibly be more efficient.
+ */
+ template< class V >
+ void BtreeBucket<V>::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const {
+ int l,h;
+ bool dontGoUp;
+ if ( direction > 0 ) {
+ l = keyOfs;
+ h = this->n - 1;
+ dontGoUp = ( customBSONCmp( keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 );
+ }
+ else {
+ l = 0;
+ h = keyOfs;
+ dontGoUp = ( customBSONCmp( keyNode( l ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 );
+ }
+ pair< DiskLoc, int > bestParent;
+ if ( dontGoUp ) {
+ // this comparison result assures h > l
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) {
+ return;
+ }
+ }
+ else {
+ // go up parents until rightmost/leftmost node is >=/<= target or at top
+ while( !BTREE(thisLoc)->parent.isNull() ) {
+ thisLoc = BTREE(thisLoc)->parent;
+ if ( direction > 0 ) {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( BTREE(thisLoc)->n - 1 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) {
+ break;
+ }
+ }
+ else {
+ if ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) {
+ break;
+ }
+ }
+ }
+ }
+ customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent );
+ }
+
+ /** @param thisLoc in/out param. perhaps thisLoc isn't the best name given that.
+ It is used by advanceTo(), which skips
+ from one key to another key without necessarily checking all the keys
+ between them in the btree (it can skip to different btree buckets).
+ advanceTo() can get called a lot, and for the different targets
+ we want to advance to, we don't want to create a bson obj in a new
+ buffer each time we call that function.
+ customLocate() exists to support advanceTo(): it does the same thing
+ as the normal locate() function, but takes basically the same arguments
+ as advanceTo().
+ */
+ template< class V >
+ void BtreeBucket<V>::customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey,
+ const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive,
+ const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) {
+ dassert( direction == 1 || direction == -1 );
+ const BtreeBucket<V> *bucket = BTREE(locInOut);
+ if ( bucket->n == 0 ) {
+ locInOut = DiskLoc();
+ return;
+ }
+ // go down until find smallest/biggest >=/<= target
+ while( 1 ) {
+ int l = 0;
+ int h = bucket->n - 1;
+
+ // +direction: 0, -direction: h
+ int z = (1-direction)/2*h;
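+ // e.g. direction == 1 -> z == 0 (leftmost key);
+ // direction == -1 -> z == h (rightmost key)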
+
+ // leftmost/rightmost key may possibly be >=/<= search key
+ int res = customBSONCmp( bucket->keyNode( z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ bool firstCheck = direction*res >= 0;
+
+ if ( firstCheck ) {
+ DiskLoc next;
+ keyOfs = z;
+ if ( direction > 0 ) {
+ dassert( z == 0 );
+ next = bucket->k( 0 ).prevChildBucket;
+ }
+ else {
+ next = bucket->nextChild;
+ }
+ if ( !next.isNull() ) {
+ bestParent = pair< DiskLoc, int >( locInOut, keyOfs );
+ locInOut = next;
+ bucket = BTREE(locInOut);
+ continue;
+ }
+ else {
+ return;
+ }
+ }
+
+ res = customBSONCmp( bucket->keyNode( h-z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction );
+ bool secondCheck = direction*res < 0;
+
+ if ( secondCheck ) {
+ DiskLoc next;
+ if ( direction > 0 ) {
+ next = bucket->nextChild;
+ }
+ else {
+ next = bucket->k( 0 ).prevChildBucket;
+ }
+ if ( next.isNull() ) {
+ // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc()
+ locInOut = bestParent.first;
+ keyOfs = bestParent.second;
+ return;
+ }
+ else {
+ locInOut = next;
+ bucket = BTREE(locInOut);
+ continue;
+ }
+ }
+
+ if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, locInOut, keyOfs, bestParent ) ) {
+ return;
+ }
+ bucket = BTREE(locInOut);
+ }
+ }
+
+ /** @thisLoc disk location of *this */
+ template< class V >
+ void BtreeBucket<V>::insertStepOne(DiskLoc thisLoc,
+ Continuation<V>& c,
+ bool dupsAllowed) const {
+ dassert( c.key.dataSize() <= this->KeyMax );
+ assert( c.key.dataSize() > 0 );
+
+ int pos;
+ bool found = find(c.idx, c.key, c.recordLoc, c.order, pos, !dupsAllowed);
+
+ if ( found ) {
+ const _KeyNode& kn = k(pos);
+ if ( kn.isUnused() ) {
+ log(4) << "btree _insert: reusing unused key" << endl;
+ c.b = this;
+ c.pos = pos;
+ c.op = Continuation<V>::SetUsed;
+ return;
+ }
+
+ DEV {
+ log() << "_insert(): key already exists in index (ok for background:true)\n";
+ log() << " " << c.idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
+ log() << " " << c.key.toString() << '\n';
+ log() << " " << "recordLoc:" << c.recordLoc.toString() << " pos:" << pos << endl;
+ log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl;
+ }
+ alreadyInIndex();
+ }
+
+ Loc ch = this->childForPos(pos);
+ DiskLoc child = ch;
+
+ if ( child.isNull() ) {
+ // A new key will be inserted at the same tree height as an adjacent existing key.
+ c.bLoc = thisLoc;
+ c.b = this;
+ c.pos = pos;
+ c.op = Continuation<V>::InsertHere;
+ return;
+ }
+
+ child.btree<V>()->insertStepOne(child, c, dupsAllowed);
+ }
+
+ /** @thisLoc disk location of *this */
+ template< class V >
+ int BtreeBucket<V>::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const Key& key, const Ordering &order, bool dupsAllowed,
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const {
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "ERROR: key too large len:" << key.dataSize() << " max:" << this->KeyMax << ' ' << key.dataSize() << ' ' << idx.indexNamespace() << endl;
+ return 2;
+ }
+ assert( key.dataSize() > 0 );
+
+ int pos;
+ bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed);
+ if ( insert_debug ) {
+ out() << " " << thisLoc.toString() << '.' << "_insert " <<
+ key.toString() << '/' << recordLoc.toString() <<
+ " l:" << lChild.toString() << " r:" << rChild.toString() << endl;
+ out() << " found:" << found << " pos:" << pos << " n:" << this->n << endl;
+ }
+
+ if ( found ) {
+ const _KeyNode& kn = k(pos);
+ if ( kn.isUnused() ) {
+ log(4) << "btree _insert: reusing unused key" << endl;
+ massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull());
+ massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull());
+ kn.writing().setUsed();
+ return 0;
+ }
+
+ DEV {
+ log() << "_insert(): key already exists in index (ok for background:true)\n";
+ log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n';
+ log() << " " << key.toString() << '\n';
+ log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl;
+ log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl;
+ log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl;
+ }
+ alreadyInIndex();
+ }
+
+ DEBUGGING out() << "TEMP: key: " << key.toString() << endl;
+ Loc ch = this->childForPos(pos);
+ DiskLoc child = ch;
+ if ( insert_debug )
+ out() << " getChild(" << pos << "): " << child.toString() << endl;
+ // In current usage, rChild isNull() for a new key and false when we are
+ // promoting a split key. These are the only two cases where _insert()
+ // is called currently.
+ if ( child.isNull() || !rChild.isNull() ) {
+ // A new key will be inserted at the same tree height as an adjacent existing key.
+ insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx);
+ return 0;
+ }
+
+ return child.btree<V>()->_insert(child, recordLoc, key, order, dupsAllowed, /*lchild*/DiskLoc(), /*rchild*/DiskLoc(), idx);
+ }
+
+ template< class V >
+ void BtreeBucket<V>::dump(unsigned depth) const {
+ string indent = string(depth, ' ');
+ _log() << "BUCKET n:" << this->n;
+ _log() << " parent:" << hex << this->parent.getOfs() << dec;
+ for ( int i = 0; i < this->n; i++ ) {
+ _log() << '\n' << indent;
+ KeyNode k = keyNode(i);
+ string ks = k.key.toString();
+ _log() << " " << hex << k.prevChildBucket.getOfs() << '\n';
+ _log() << indent << " " << i << ' ' << ks.substr(0, 30) << " Loc:" << k.recordLoc.toString() << dec;
+ if ( this->k(i).isUnused() )
+ _log() << " UNUSED";
+ }
+ _log() << "\n" << indent << " " << hex << this->nextChild.getOfs() << dec << endl;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const
+ {
+
+ if ( c.key.dataSize() > this->KeyMax ) {
+ problem() << "ERROR: key too large len:" << c.key.dataSize() << " max:" << this->KeyMax << ' ' << c.key.dataSize() << ' ' << c.idx.indexNamespace() << endl;
+ return; // op=Nothing
+ }
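+ // Step one only records what to do in the Continuation (SetUsed or
+ // InsertHere) so that it can run under a weaker, upgradable lock - see the
+ // twoStepInsert() comment in btree.h; the recorded op is applied afterwards.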
+ insertStepOne(thisLoc, c, dupsAllowed);
+ }
+
+ /** TODO: meaning of return code unclear; clean up */
+ template< class V >
+ int BtreeBucket<V>::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& _key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel) const
+ {
+ guessIncreasing = _key.firstElementType() == jstOID && idx.isIdIndex();
+ KeyOwned key(_key);
+
+ dassert(toplevel);
+ if ( toplevel ) {
+ if ( key.dataSize() > this->KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.dataSize() << ' ' << key.toString() << endl;
+ return 3;
+ }
+ }
+
+ int x;
+ try {
+ x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx);
+ this->assertValid( order );
+ }
+ catch( ... ) {
+ guessIncreasing = false;
+ throw;
+ }
+ guessIncreasing = false;
+ return x;
+ }
+
+ template< class V >
+ void BtreeBucket<V>::shape(stringstream& ss) const {
+ this->_shape(0, ss);
+ }
+
+ template< class V >
+ int BtreeBucket<V>::getKeyMax() {
+ return V::KeyMax;
+ }
+
+ template< class V >
+ DiskLoc BtreeBucket<V>::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const {
+ int pos;
+ bool found;
+ // TODO: is it really ok here that the order is a default?
+ // for findById() use, yes. for checkNoIndexConflicts, no?
+ Ordering o = Ordering::make(BSONObj());
+ DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc );
+ if ( bucket.isNull() )
+ return bucket;
+
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ while ( 1 ) {
+ const _KeyNode& knraw = b->k(pos);
+ if ( knraw.isUsed() )
+ break;
+ bucket = b->advance( bucket , pos , 1 , "findSingle" );
+ if ( bucket.isNull() )
+ return bucket;
+ b = bucket.btree<V>();
+ }
+ KeyNode kn = b->keyNode( pos );
+ if ( KeyOwned(key).woCompare( kn.key, o ) != 0 )
+ return DiskLoc();
+ return kn.recordLoc;
+ }
+
+} // namespace mongo
+
+#include "db.h"
+#include "dbhelpers.h"
+
+namespace mongo {
+
+ template< class V >
+ void BtreeBucket<V>::a_test(IndexDetails& id) {
+ BtreeBucket *b = id.head.btreemod<V>();
+
+ // record locs for testing
+ DiskLoc A(1, 20);
+ DiskLoc B(1, 30);
+ DiskLoc C(1, 40);
+
+ DiskLoc rl;
+ BSONObj key = fromjson("{x:9}");
+ BSONObj orderObj = fromjson("{}");
+ Ordering order = Ordering::make(orderObj);
+
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ b->bt_insert(id.head, A, key, order, true, id);
+ A.GETOFS() += 2;
+ assert( b->k(0).isUsed() );
+// b->k(0).setUnused();
+ b->k(1).setUnused();
+ b->k(2).setUnused();
+ b->k(3).setUnused();
+
+ b->dumpTree(id.head, orderObj);
+
+ /* b->bt_insert(id.head, B, key, order, false, id);
+ b->k(1).setUnused();
+ b->dumpTree(id.head, order);
+ b->bt_insert(id.head, A, key, order, false, id);
+ b->dumpTree(id.head, order);
+ */
+
+ // this should assert. does it? (it might assert "accidentally" though; not asserting proves a problem, while asserting proves nothing)
+ b->bt_insert(id.head, C, key, order, false, id);
+
+ // b->dumpTree(id.head, order);
+ }
+
+ template class BucketBasics<V0>;
+ template class BucketBasics<V1>;
+ template class BtreeBucket<V0>;
+ template class BtreeBucket<V1>;
+ template struct __KeyNode<DiskLoc>;
+ template struct __KeyNode<DiskLoc56Bit>;
+
+ struct BTUnitTest : public UnitTest {
+ void run() {
+ DiskLoc big(0xf12312, 0x70001234);
+ DiskLoc56Bit bigl;
+ {
+ bigl = big;
+ assert( big == bigl );
+ DiskLoc e = bigl;
+ assert( big == e );
+ }
+ {
+ DiskLoc d;
+ assert( d.isNull() );
+ DiskLoc56Bit l;
+ l = d;
+ assert( l.isNull() );
+ d = l;
+ assert( d.isNull() );
+ assert( l < bigl );
+ }
+ }
+ } btunittest;
+
+}
diff --git a/src/mongo/db/btree.h b/src/mongo/db/btree.h
new file mode 100644
index 00000000000..85e5172d163
--- /dev/null
+++ b/src/mongo/db/btree.h
@@ -0,0 +1,1174 @@
+// btree.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "diskloc.h"
+#include "pdfile.h"
+#include "key.h"
+
+namespace mongo {
+
+ /**
+ * Our btree implementation generally follows the standard btree algorithm,
+ * which is described in many places. The nodes of our btree are referred to
+ * as buckets below. These buckets are of size BucketSize and their body is
+ * an ordered array of <bson key, disk loc> pairs, where disk loc is the disk
+ * location of a document and bson key is a projection of this document into
+ * the schema of the index for this btree. Ordering is determined on the
+ * basis of bson key first and then disk loc in case of a tie. All bson keys
+ * for a btree have identical schemas with empty string field names and may
+ * not have an objsize() exceeding KeyMax. The btree's buckets are
+ * themselves organized into an ordered tree. Although there are exceptions,
+ * generally buckets with n keys have n+1 children and the body of a bucket is
+ * at least lowWaterMark bytes. A more strictly enforced requirement is that
+ * a non root bucket must have at least one key except in certain transient
+ * states.
+ *
+ * Our btrees support the following primary read operations: finding a
+ * specified key; iterating from a starting key to the next or previous
+ * ordered key; and skipping from a starting key to another specified key
+ * without checking every intermediate key. The primary write operations
+ * are insertion and deletion of keys. Insertion may trigger a bucket split
+ * if necessary to avoid bucket overflow. In such a case, subsequent splits
+ * will occur recursively as necessary. Deletion may trigger a bucket
+ * rebalance, in which a size deficient bucket is filled with keys from an
+ * adjacent bucket. In this case, splitting may potentially occur in the
+ * parent. Deletion may alternatively trigger a merge, in which the keys
+ * from two buckets and a key from their shared parent are combined into the
+ * same bucket. In such a case, rebalancing or merging may proceed
+ * recursively from the parent.
+ *
+ * While the btree data format has been relatively constant over time, btrees
+ * initially created by versions of mongo earlier than the current version
+ * may embody different properties than freshly created btrees (while
+ * following the same data format). These older btrees are referred to
+ * below as legacy btrees.
+ */
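+ // For example, in an index on { x : 1 }, two documents with x == 9 both
+ // produce the bson key { "" : 9 } (note the empty field name), and their
+ // entries are then ordered by disk loc.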
+
+ const int OldBucketSize = 8192;
+
+#pragma pack(1)
+ template< class Version > class BucketBasics;
+
+ /**
+ * This is the fixed width data component for storage of a key within a
+ * bucket. It contains an offset pointer to the variable width bson
+ * data component. A _KeyNode may be 'unused', please see below.
+ */
+ template< class Loc >
+ struct __KeyNode {
+ /** Signals that we are writing this _KeyNode and casts away const */
+ __KeyNode<Loc> & writing() const;
+ /**
+ * The 'left' child bucket of this key. If this is the i-th key, it
+ * points to the i index child bucket.
+ */
+ Loc prevChildBucket;
+ /** The location of the record associated with this key. */
+ Loc recordLoc;
+ short keyDataOfs() const { return (short) _kdo; }
+
+ /** Offset within current bucket of the variable width bson key for this _KeyNode. */
+ unsigned short _kdo;
+ void setKeyDataOfs(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ /** Seems to be redundant. */
+ void setKeyDataOfsSavingUse(short s) {
+ _kdo = s;
+ assert(s>=0);
+ }
+ /**
+ * Unused keys are not returned by read operations. Keys may be marked
+ * as unused in cases where it is difficult to delete them while
+ * maintaining the constraints required of a btree.
+ *
+ * Setting ofs to odd is the sentinel for unused, as real recordLoc's
+ * are always even numbers. Note we need to keep its value basically
+ * the same as we use the recordLoc as part of the key in the index
+ * (to handle duplicate keys efficiently).
+ *
+ * Flagging keys as unused is a feature that is being phased out in favor
+ * of deleting the keys outright. The current btree implementation is
+ * not expected to mark a key as unused in a non legacy btree.
+ */
+ void setUnused() {
+ recordLoc.GETOFS() |= 1;
+ }
+ void setUsed() { recordLoc.GETOFS() &= ~1; }
+ int isUnused() const {
+ return recordLoc.getOfs() & 1;
+ }
+ int isUsed() const {
+ return !isUnused();
+ }
+ };
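+ // Sketch of the sentinel described above: real record offsets are always
+ // even, so the low bit of recordLoc's ofs can flag the key as unused, e.g.:
+ //
+ // DiskLoc loc(0, 0x1000); // a used key: ofs 0x1000 (even)
+ // loc.GETOFS() |= 1; // setUnused(): ofs becomes 0x1001 (odd)
+ // bool unused = loc.getOfs() & 1; // isUnused() reads the same bit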
+
+ /**
+ * This structure represents header data for a btree bucket. An object of
+ * this type is typically allocated inside of a buffer of size BucketSize,
+ * resulting in a full bucket with an appropriate header.
+ *
+ * The body of a btree bucket contains an array of _KeyNode objects starting
+ * from its lowest indexed bytes and growing to higher indexed bytes. The
+ * body also contains variable width bson keys, which are allocated from the
+ * highest indexed bytes toward lower indexed bytes.
+ *
+ * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
+ * h = header data
+ * k = KeyNode data
+ * - = empty space
+ * b = bson key data
+ * u = unused (old) bson key data, that may be garbage collected
+ */
+ class BtreeData_V0 {
+ protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ DiskLoc parent;
+ /** Given that there are n keys, this is the n index child. */
+ DiskLoc nextChild;
+ /** can be reused, value is 8192 in current pdfile version Apr2010 */
+ unsigned short _wasSize;
+ /** zero */
+ unsigned short _reserved1;
+ int flags;
+
+ void _init() {
+ _reserved1 = 0;
+ _wasSize = BucketSize;
+ reserved = 0;
+ }
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ int emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ int topSize;
+ /* Number of keys in the bucket. */
+ int n;
+
+ int reserved;
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ public:
+ typedef __KeyNode<DiskLoc> _KeyNode;
+ typedef DiskLoc Loc;
+ typedef KeyBson Key;
+ typedef KeyBson KeyOwned;
+ enum { BucketSize = 8192 };
+
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = OldBucketSize / 10;
+ };
+
+ // a a a ofs ofs ofs ofs
+ class DiskLoc56Bit {
+ int ofs;
+ unsigned char _a[3];
+ unsigned long long Z() const {
+ // endian
+ return *((unsigned long long*)this) & 0x00ffffffffffffffULL;
+ }
+ enum {
+ // the low bit of offsets is used by _KeyNode (as the unused flag), so we don't use -1 here.
+ OurNullOfs = -2
+ };
+ public:
+ template< class V >
+ const BtreeBucket<V> * btree() const {
+ return DiskLoc(*this).btree<V>();
+ }
+ template< class V >
+ BtreeBucket<V> * btreemod() const {
+ return DiskLoc(*this).btreemod<V>();
+ }
+ operator const DiskLoc() const {
+ // endian
+ if( isNull() ) return DiskLoc();
+ unsigned a = *((unsigned *) (_a-1));
+ return DiskLoc(a >> 8, ofs);
+ }
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
+ bool operator<(const DiskLoc56Bit& rhs) const {
+ // the ordering of dup keys in btrees isn't too critical, but we'd like to put items that are
+ // close together on disk close together in the tree, so we do want the file # to be the most significant
+ // bytes
+ return Z() < rhs.Z();
+ }
+ int compare(const DiskLoc56Bit& rhs) const {
+ unsigned long long a = Z();
+ unsigned long long b = rhs.Z();
+ if( a < b ) return -1;
+ return a == b ? 0 : 1;
+ }
+ bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); }
+ bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); }
+ bool operator==(const DiskLoc& rhs) const {
+ return DiskLoc(*this) == rhs;
+ }
+ bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); }
+ bool isNull() const { return ofs < 0; }
+ void Null() {
+ ofs = OurNullOfs;
+ _a[0] = _a[1] = _a[2] = 0;
+ }
+ string toString() const { return DiskLoc(*this).toString(); }
+ void operator=(const DiskLoc& loc) {
+ ofs = loc.getOfs();
+ int la = loc.a();
+ assert( la <= 0xffffff ); // must fit in 3 bytes
+ if( la < 0 ) {
+ assert( la == -1 );
+ la = 0;
+ ofs = OurNullOfs;
+ }
+ memcpy(_a, &la, 3); // endian
+ dassert( ofs != 0 );
+ }
+ DiskLoc56Bit& writing() const {
+ return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7));
+ }
+ };
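+ // Usage sketch, mirroring BTUnitTest in btree.cpp: a DiskLoc round-trips
+ // through the packed 56 bit form as long as the file # fits in 3 bytes:
+ //
+ // DiskLoc big(0xf12312, 0x70001234);
+ // DiskLoc56Bit packed;
+ // packed = big; // operator=(const DiskLoc&)
+ // assert( big == packed ); // equality via conversion back to DiskLoc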
+
+ class BtreeData_V1 {
+ public:
+ typedef DiskLoc56Bit Loc;
+ //typedef DiskLoc Loc;
+ typedef __KeyNode<Loc> _KeyNode;
+ typedef KeyV1 Key;
+ typedef KeyV1Owned KeyOwned;
+ enum { BucketSize = 8192-16 }; // leave room for Record header
+ // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
+ static const int KeyMax = 1024;
+ protected:
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ Loc parent;
+ /** Given that there are n keys, this is the n index child. */
+ Loc nextChild;
+
+ unsigned short flags;
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ unsigned short emptySize;
+ /** Size used for bson storage, including storage of old keys. */
+ unsigned short topSize;
+ /* Number of keys in the bucket. */
+ unsigned short n;
+
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ void _init() { }
+ };
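+ // Side by side, the two on-disk formats differ as follows:
+ // V0: Loc = DiskLoc (8 bytes), int-sized counters, BucketSize = 8192
+ // V1: Loc = DiskLoc56Bit (7 bytes), unsigned short counters,
+ // BucketSize = 8192-16 (leaving room for the Record header), KeyMax = 1024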
+
+ typedef BtreeData_V0 V0;
+ typedef BtreeData_V1 V1;
+
+ /**
+ * This class adds functionality to BtreeData for managing a single bucket.
+ * The following policies are used in an attempt to encourage simplicity:
+ *
+ * Const member functions of this class are those which may be called on
+ * an object for which writing has not been signaled. Non const member
+ * functions may only be called on objects for which writing has been
+ * signaled. Note that currently some const functions write to the
+ * underlying memory representation of this bucket using optimized methods
+ * to signal write operations.
+ *
+ * DiskLoc parameters that may shadow references within the btree should
+ * be passed by value rather than by reference to non const member
+ * functions or to const member functions which may perform writes. This way
+ * a callee need not worry that write operations will change or invalidate
+ * its arguments.
+ *
+ * The current policy for dealing with bson arguments is the opposite of
+ * what is described above for DiskLoc arguments. We do not want to copy
+ * bson into memory as an intermediate step for btree changes, and if bson
+ * is to be moved it must be copied to the new location before the old
+ * location is invalidated. Care should be taken in cases where that invalid
+ * memory may be implicitly referenced by function arguments.
+ *
+ * A number of functions below require a thisLoc argument, which must be the
+ * disk location of the bucket mapped to 'this'.
+ */
+ template< class Version >
+ class BucketBasics : public Version {
+ public:
+ template <class U> friend class BtreeBuilder;
+ typedef typename Version::Key Key;
+ typedef typename Version::_KeyNode _KeyNode;
+ typedef typename Version::Loc Loc;
+
+ int getN() const { return this->n; }
+
+ /**
+ * This is an in memory wrapper for a _KeyNode, and not itself part of btree
+ * storage. This object and its BSONObj 'key' will become invalid if the
+ * _KeyNode data that generated it is moved within the btree. In general,
+ * a KeyNode should not be expected to be valid after a write.
+ */
+ class KeyNode {
+ public:
+ KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k);
+ const Loc& prevChildBucket;
+ const Loc& recordLoc;
+ /* Points to the bson key storage for a _KeyNode */
+ Key key;
+ };
+ friend class KeyNode;
+
+ /** Assert write intent declared for this bucket already. */
+ void assertWritable();
+
+ void assertValid(const Ordering &order, bool force = false) const;
+ void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }
+
+ /**
+ * @return KeyNode for key at index i. The KeyNode will become invalid
+ * if the key is moved or reassigned, or if the node is packed. In general
+ * a KeyNode should not be expected to be valid after a write.
+ */
+ const KeyNode keyNode(int i) const {
+ if ( i >= this->n ) {
+ massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n );
+ }
+ return KeyNode(*this, k(i));
+ }
+
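+ // headerSize() below measures the offset of 'data' within the bucket via a
+ // null object pointer - the classic offsetof idiom (written out by hand,
+ // presumably because the members live in the templated Version base).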
+ static int headerSize() {
+ const BucketBasics *d = 0;
+ return (char*)&(d->data) - (char*)&(d->parent);
+ }
+ static int bodySize() { return Version::BucketSize - headerSize(); }
+ static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp
+
+ // for testing
+ int nKeys() const { return this->n; }
+ const DiskLoc getNextChild() const { return this->nextChild; }
+
+ protected:
+ char * dataAt(short ofs) { return this->data + ofs; }
+
+ /** Initialize the header for a new node. */
+ void init();
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key is inserted at position keypos, the bucket's keys will still be
+ * in order.
+ * Postconditions:
+ * - If key can fit in the bucket, the bucket may be packed and keypos
+ * may be decreased to reflect deletion of earlier indexed keys during
+ * packing, the key will be inserted at the updated keypos index with
+ * a null prevChildBucket, the subsequent keys shifted to the right,
+ * and the function will return true.
+ * - If key cannot fit in the bucket, the bucket will be packed and
+ * the function will return false.
+ * Although this function is marked const, it modifies the underlying
+ * btree representation through an optimized write intent mechanism.
+ */
+ bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const;
+
+ /**
+ * Preconditions:
+ * - key / recordLoc are > all existing keys
+ * - The keys in prevChild and their descendents are between all existing
+ * keys and 'key'.
+ * Postconditions:
+ * - If there is space for key without packing, it is inserted as the
+ * last key with specified prevChild and true is returned.
+ * Importantly, nextChild is not updated!
+ * - Otherwise false is returned and there is no change.
+ */
+ bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild);
+ void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
+ bool ok = _pushBack( recordLoc , key , order , prevChild );
+ assert(ok);
+ }
+
+ /**
+ * This is a special purpose function used by BtreeBuilder. The
+ * interface is quite dangerous if you're not careful. The bson key
+ * returned here points to bucket memory that has been invalidated but
+ * not yet reclaimed.
+ *
+ * TODO Maybe this could be replaced with two functions, one which
+ * returns the last key without deleting it and another which simply
+ * deletes the last key. Then the caller would have enough control to
+ * ensure proper memory integrity.
+ *
+ * Preconditions:
+ * - bucket is not empty
+ * - last key of bucket is used (not unused)
+ * - nextChild isNull()
+ * - _unalloc will work correctly as used - see code
+ * Postconditions:
+ * - The last key of the bucket is removed, and its key and recLoc are
+ * returned. As mentioned above, the key points to unallocated memory.
+ */
+ void popBack(DiskLoc& recLoc, Key &key);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - there is no child bucket at keypos
+ * - n > 1
+ * - if mayEmpty == false or nextChild.isNull(), n > 0
+ * Postconditions:
+ * - The key at keypos is removed, and remaining keys are shifted over.
+ * - The bucket becomes unpacked.
+ * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys.
+ */
+ void _delKeyAtPos(int keypos, bool mayEmpty = false);
+
+ /* !Packed means there is deleted fragment space within the bucket.
+ We "repack" when we run out of space before considering the node
+ to be full.
+ */
+ enum Flags { Packed=1 };
+
+ /** n == 0 is ok */
+ const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+ Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
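+ // i.e. a bucket's n keys interleave with its n + 1 children:
+ // child(0) key(0) child(1) key(1) ... key(n-1) child(n)
+ // where child(i) is k(i).prevChildBucket for i < n, and child(n) is nextChild.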
+
+ /** Same as bodySize(). */
+ int totalDataSize() const;
+ /**
+ * @return true when a key may be dropped by pack()
+ * @param index index of the key that may be dropped
+ * @param refPos index of a particular key of interest, which must not
+ * be dropped; = 0 to safely ignore
+ */
+ bool mayDropKey( int index, int refPos ) const;
+
+ /**
+ * Pack the bucket to reclaim space from invalidated memory.
+ * @refPos is an index in the bucket which may be updated if we
+ * delete keys from the bucket
+ * This function may cast away const and perform a write.
+ * Preconditions: none
+ * Postconditions:
+ * - Bucket will be packed
+ * - Some unused nodes may be dropped, but not ones at index 0 or refPos
+ * - Some used nodes may be moved
+ * - If refPos is the index of an existing key, it will be updated to that
+ * key's new index if the key is moved.
+ */
+ void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
+ /** Pack when already writable */
+ void _packReadyForMod(const Ordering &order, int &refPos);
+
+ /** @return the size the bucket's body would have if we were to call pack() */
+ int packedDataSize( int refPos ) const;
+ void setNotPacked() { this->flags &= ~Packed; }
+ void setPacked() { this->flags |= Packed; }
+ /**
+ * Preconditions: 'bytes' is <= emptySize
+ * Postconditions: A buffer of size 'bytes' is allocated on the top side,
+ * and its offset is returned.
+ */
+ int _alloc(int bytes);
+ /**
+ * This function can be used to deallocate the lowest byte index bson
+ * buffer in the top region, which in some but not all cases is for the
+ * n - 1 index key. This function only works correctly in certain
+ * special cases, please be careful.
+ * Preconditions: 'bytes' <= topSize
+ * Postconditions: The top region is decreased
+ */
+ void _unalloc(int bytes);
+ /**
+ * Preconditions: 'N' <= n
+ * Postconditions:
+ * - All keys at index N and above are dropped.
+ * - The bucket is then packed, without dropping the refPos key if refPos < N.
+ */
+ void truncateTo(int N, const Ordering &order, int &refPos);
+ /**
+ * Preconditions:
+ * - 'nDrop' < n
+ * - for now, refPos should be zero.
+ * Postconditions:
+ * - All keys before the nDrop index key are dropped.
+ * - The bucket is packed.
+ */
+ void dropFront(int nDrop, const Ordering &order, int &refPos);
+ /**
+ * Preconditions: 0 <= keypos < n
+ * Postconditions: keypos indexed key is marked unused.
+ */
+ void markUnused(int keypos);
+
+ /**
+ * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
+ * We use tempNext() when we do that, to be less confusing. (One might have written a union in C.)
+ */
+ DiskLoc tempNext() const { return this->parent; }
+ void setTempNext(DiskLoc l) { this->parent = l; }
+
+ void _shape(int level, stringstream&) const;
+ int Size() const;
+
+ /** @return i-indexed _KeyNode, without bounds checking */
+ public:
+ const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; }
+ _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; }
+ protected:
+ _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; }
+
+ /**
+ * Preconditions: 'this' is packed
+ * @return the key index to be promoted on split
+ * @param keypos The requested index of a key to insert, which may affect
+ * the choice of split position.
+ */
+ int splitPos( int keypos ) const;
+
+ /**
+ * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize
+ * Postconditions:
+ * - Increases indexes of existing _KeyNode objects by nAdd, reserving
+ * space for additional _KeyNode objects at front.
+ * - Does not initialize ofs values for the bson data of these
+ * _KeyNode objects.
+ */
+ void reserveKeysFront( int nAdd );
+
+ /**
+ * Preconditions:
+ * - 0 <= i < n
+ * - The bson 'key' must fit in the bucket without packing.
+ * - If 'key' and 'prevChildBucket' are set at index i, the btree
+ * ordering properties will be maintained.
+ * Postconditions:
+ * - The specified key is set at index i, replacing the existing
+ * _KeyNode data and without shifting any other _KeyNode objects.
+ */
+ void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket );
+ };
+
+ template< class V>
+ struct Continuation;
+
+ /**
+ * This class adds functionality for manipulating buckets that are assembled
+ * in a tree. The requirements for const and non const functions and
+ * arguments are generally the same as in BtreeBucket. Because this class
+ * deals with tree structure, some functions that are marked const may
+ * trigger modification of another node in the btree or potentially of the
+ * current node. In such cases, the function's implementation explicitly
+ * casts away const when indicating an intent to write to the durability
+ * layer. The DiskLocs provided to such functions should be passed by
+ * value if they shadow pointers within the btree.
+ *
+ * To clarify enforcement of referential integrity in this implementation,
+ * we use the following pattern when deleting data we have a persistent
+ * pointer to. The pointer is cleared or removed explicitly, then the data
+ * it pointed to is cleaned up with a helper function.
+ *
+ * TODO It might make sense to put some of these functions in a class
+ * representing a full btree instead of a single btree bucket. That would
+ * allow us to use the const qualifier in a manner more consistent with
+ * standard usage. Right now the interface is for both a node and a tree,
+ * so assignment of const is sometimes nonideal.
+ *
+ * TODO There are several cases in which the 'this' pointer is invalidated
+ * as a result of deallocation. A separate class representing a btree would
+ * alleviate some fragile cases where the implementation must currently
+ * behave correctly if the 'this' pointer is suddenly invalidated by a
+ * callee.
+ */
+ template< class V >
+ class BtreeBucket : public BucketBasics<V> {
+ friend class BtreeCursor;
+ friend struct Continuation<V>;
+ public:
+ // make compiler happy:
+ typedef typename V::Key Key;
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename BucketBasics<V>::_KeyNode _KeyNode;
+ typedef typename BucketBasics<V>::Loc Loc;
+ const _KeyNode& k(int i) const { return static_cast< const BucketBasics<V> * >(this)->k(i); }
+ protected:
+ _KeyNode& k(int i) { return static_cast< BucketBasics<V> * >(this)->_k(i); }
+ public:
+ const KeyNode keyNode(int i) const { return static_cast< const BucketBasics<V> * >(this)->keyNode(i); }
+
+ bool isHead() const { return this->parent.isNull(); }
+ void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
+ long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */
+
+ bool isUsed( int i ) const { return this->k(i).isUsed(); }
+ string bucketSummary() const;
+ void dump(unsigned depth=0) const;
+
+ /**
+ * @return true if key exists in index
+ *
+ * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
+ * BSONObj order = ((IndexDetails&)idx).keyPattern();
+ * likewise below in bt_insert() etc.
+ */
+ private:
+ bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const;
+ public:
+
+ /**
+ * @param self - don't complain about a duplicate when it is 'self', i.e. when ourself is already in the index.
+ * @return true = There is a duplicate used key.
+ */
+ bool wouldCreateDup(
+ const IndexDetails& idx, const DiskLoc &thisLoc,
+ const Key& key, const Ordering& order,
+ const DiskLoc &self) const;
+
+ /**
+ * Preconditions: none
+ * Postconditions: @return a new bucket allocated from pdfile storage
+ * and init()-ed. This bucket is suitable to for use as a new root
+ * or any other new node in the tree.
+ */
+ static DiskLoc addBucket(const IndexDetails&);
+
+ /**
+ * Preconditions: none
+ * Postconditions:
+ * - Some header values in this bucket are cleared, and the bucket is
+ * deallocated from pdfile storage.
+ * - The memory at thisLoc is invalidated, and 'this' is invalidated.
+ */
+ void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
+
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index.
+ * - All other parameters are valid and consistent with this index if applicable.
+ * Postconditions:
+ * - If key is bigger than KeyMax, @return 2 or 3 and no change.
+ * - If key / recordLoc exist in the btree as an unused key, set them
+ * as used and @return 0
+ * - If key / recordLoc exist in the btree as a used key, @throw
+ * exception 10287 and no change.
+ * - If key / recordLoc do not exist in the btree, they are inserted
+ * and @return 0. The root of the btree may be changed, so
+ * 'this'/thisLoc may no longer be the root upon return.
+ */
+ int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const;
+
+ /** Does the insert in two steps - an upgradable lock can then be used for step 1, which
+ is the part that may page fault. That step is also most of the computational work.
+ */
+ void twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const;
+
+ /**
+ * Preconditions:
+ * - 'key' has a valid schema for this index, and may have objsize() > KeyMax.
+ * Postconditions:
+ * - If key / recordLoc are in the btree, they are removed (possibly
+ * by being marked as an unused key), @return true, and potentially
+ * invalidate 'this' / thisLoc and change the head.
+ * - If key / recordLoc are not in the btree, @return false and do nothing.
+ */
+ bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
+
+ /**
+ * locate may return an "unused" key that is just a marker. so be careful.
+ * looks for a key:recordloc pair.
+ *
+ * @found - returns true if exact match found. note you can get back a position
+ * result even if found is false.
+ */
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+
+ /**
+ * find the first instance of the key
+ * does not handle dups
+ * WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ * findSingle code.
+ * @return the record location of the first match
+ */
+ DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
+
+ /**
+ * Advance to next or previous key in the index.
+ * @param direction to advance.
+ */
+ DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;
+
+ /** Advance in specified direction to the specified key */
+ void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
+
+ /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */
+ static void customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) ;
+
+ /** @return head of the btree by traversing from current bucket. */
+ const DiskLoc getHead(const DiskLoc& thisLoc) const;
+
+ /** get tree shape */
+ void shape(stringstream&) const;
+
+ static void a_test(IndexDetails&);
+
+ static int getKeyMax();
+
+ protected:
+ /**
+ * Preconditions:
+ * - 0 <= firstIndex <= n
+ * - -1 <= lastIndex <= n ( -1 is equivalent to n )
+ * Postconditions:
+ * - Any children at indexes firstIndex through lastIndex (inclusive)
+ * will have their parent pointers set to thisLoc.
+ */
+ void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;
+
+ /**
+ * Preconditions:
+ * - thisLoc is not the btree head.
+ * - n == 0 is ok
+ * Postconditions:
+ * - All cursors pointing to this bucket will be updated.
+ * - This bucket's parent's child pointer is set to null.
+ * - This bucket is deallocated from pdfile storage.
+ * - 'this' and thisLoc are invalidated.
+ */
+ void delBucket(const DiskLoc thisLoc, const IndexDetails&);
+
+ /**
+ * Preconditions: 0 <= p < n
+ * Postconditions:
+ * - The key at index p is removed from the btree.
+ * - 'this' and thisLoc may be invalidated.
+ * - The tree head may change.
+ */
+ void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);
+
+ /**
+ * Preconditions:
+ * - n == 0 is ok
+ * Postconditions:
+ * - If thisLoc is head, or if its body has at least lowWaterMark bytes,
+ * return false and do nothing.
+ * - Otherwise, if thisLoc has left or right neighbors, either balance
+ * or merge with them and return true. Also, 'this' and thisLoc may
+ * be invalidated and the tree head may change.
+ */
+ bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;
+
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - The child at leftIndex or the child at leftIndex + 1 contains
+ * fewer than lowWaterMark bytes.
+ * Postconditions:
+ * - If the child bucket at leftIndex can merge with the child index
+ * at leftIndex + 1, do nothing and return false.
+ * - Otherwise, balance keys between the leftIndex child and the
+ * leftIndex + 1 child, return true, and possibly change the tree head.
+ */
+ bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
+
+ /**
+ * Preconditions:
+ * - All preconditions of tryBalanceChildren.
+ * - The leftIndex child and leftIndex + 1 child cannot be merged.
+ * Postconditions:
+ * - Keys are moved between the leftIndex child and the leftIndex + 1
+ * child such that neither child has fewer than lowWaterMark bytes.
+ * The tree head may change.
+ */
+ void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
+
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex + 1 child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in lchild at index split is set as thisLoc's key at index
+ * leftIndex, which may trigger a split and change the tree head.
+ * The previous key in thisLoc at index leftIndex and all keys with
+ * indexes greater than split in lchild are moved to rchild.
+ */
+ void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+ /**
+ * Preconditions:
+ * - All preconditions of doBalanceChildren
+ * - The leftIndex and leftIndex + 1 children are packed.
+ * - The leftIndex child has fewer than lowWaterMark bytes.
+ * - split returned by rebalancedSeparatorPos()
+ * Postconditions:
+ * - The key in rchild at index split - l->n - 1 is set as thisLoc's key
+ * at index leftIndex, which may trigger a split and change the tree
+ * head. The previous key in thisLoc at index leftIndex and all keys
+ * with indexes less than split - l->n - 1 in rchild are moved to
+ * lchild.
+ */
+ void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket<V> *l, const DiskLoc lchild,
+ BtreeBucket<V> *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+
+ /**
+ * Preconditions:
+ * - 0 <= leftIndex < n
+ * - this->canMergeChildren( thisLoc, leftIndex ) == true
+ * Postconditions:
+ * - All of the above mentioned keys will be placed in the left child.
+ * - The tree may be updated recursively, resulting in 'this' and
+ * thisLoc being invalidated and the tree head being changed.
+ */
+ void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);
+
+ /**
+ * Preconditions:
+ * - n == 0
+ * - !nextChild.isNull()
+ * Postconditions:
+ * - 'this' and thisLoc are deallocated (and invalidated), any cursors
+ * to them are updated, and the tree head may change.
+ * - nextChild replaces thisLoc in the btree structure.
+ */
+ void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );
+
+ /**
+ * @return true iff the leftIndex and leftIndex + 1 children both exist,
+ * and if their body sizes when packed and the thisLoc key at leftIndex
+ * would fit in a single bucket body.
+ */
+ bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * Preconditions:
+ * - leftIndex and leftIndex + 1 children are packed
+ * - leftIndex or leftIndex + 1 child is below lowWaterMark
+ * @return index of the rebalanced separator; the index value is
+ * determined as if we had a bucket with body
+ * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
+ * and called splitPos( 0 ) on it.
+ */
+ int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * Preconditions: thisLoc has a parent
+ * @return parent's index of thisLoc.
+ */
+ int indexInParent( const DiskLoc &thisLoc ) const;
+
+ public:
+ Key keyAt(int i) const {
+ if( i >= this->n )
+ return Key();
+ return Key(this->data + k(i).keyDataOfs());
+ }
+ protected:
+
+ /**
+ * Preconditions:
+ * - This bucket is packed.
+ * - Cannot add a key of size KeyMax to this bucket.
+ * - 0 <= keypos <= n is the position of a new key that will be inserted
+ * - lchild is equal to the existing child at index keypos.
+ * Postconditions:
+ * - The thisLoc bucket is split into two packed buckets, possibly
+ * invalidating the initial position of keypos, with a split key
+ * promoted to the parent. The new key key/recordLoc will be inserted
+ * into one of the split buckets, and lchild/rchild set appropriately.
+ * Splitting may occur recursively, possibly changing the tree head.
+ */
+ void split(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key,
+ const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos <= n
+ * - If key / recordLoc are inserted at position keypos, with provided
+ * lchild and rchild, the btree ordering requirements will be
+ * maintained.
+ * - lchild is equal to the existing child at index keypos.
+ * - n == 0 is ok.
+ * Postconditions:
+ * - The key / recordLoc are inserted at position keypos, and the
+ * bucket is split if necessary, which may change the tree head.
+ * - The bucket may be packed or split, invalidating the specified value
+ * of keypos.
+ * This function will always modify thisLoc, but it's marked const because
+ * it commonly relies on the specialized write intent mechanism of basicInsert().
+ */
+ void insertHere(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key& key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
+
+ /** bt_insert() is basically just a wrapper around this. */
+ int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const Key& key, const Ordering &order, bool dupsAllowed,
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
+
+ void insertStepOne(DiskLoc thisLoc, Continuation<V>& c, bool dupsAllowed) const;
+
+ bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
+ static bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) ;
+ static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
+ static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
+
+ /** If child is non null, set its parent to thisLoc */
+ static void fix(const DiskLoc thisLoc, const DiskLoc child);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - If the specified key and recordLoc are placed in keypos of thisLoc,
+ * and lchild and rchild are set, the btree ordering properties will
+ * be maintained.
+ * - rchild == childForPos( keypos + 1 )
+ * - childForPos( keypos ) is referenced elsewhere if nonnull.
+ * Postconditions:
+ * - The key at keypos will be replaced with the specified key and
+ * lchild, potentially splitting this bucket and changing the tree
+ * head.
+ * - childForPos( keypos ) will be orphaned.
+ */
+ void setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const Key &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);
+
+ /**
+ * Preconditions:
+ * - 0 <= keypos < n
+ * - The keypos or keypos+1 indexed child is non null.
+ * Postconditions:
+ * - The specified key is deleted by replacing it with another key if
+ * possible. This replacement may cause a split and change the tree
+ * head. The replacement key will be deleted from its original
+ * location, potentially causing merges and splits that may invalidate
+ * 'this' and thisLoc and change the tree head.
+ * - If the key cannot be replaced, it will be marked as unused. This
+ * is only expected in legacy btrees.
+ */
+ void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
+ public:
+ /** simply builds and returns a dup key error message string */
+ static string dupKeyError( const IndexDetails& idx , const Key& key );
+ };
+#pragma pack()
+
+ class FieldRangeVector;
+ class FieldRangeVectorIterator;
+
+ class BtreeCursor : public Cursor {
+ protected:
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+ public:
+ virtual ~BtreeCursor();
+ /** makes an appropriate subclass depending on the index version */
+ static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
+ static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
+
+ virtual bool ok() { return !bucket.isNull(); }
+ virtual bool advance();
+ virtual void noteLocation(); // updates keyAtKeyOfs...
+ virtual void checkLocation() = 0;
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return true; }
+
+ /**
+         * Used during multikey index traversal to avoid returning duplicate
+         * documents; see Matcher::matches(). During such a traversal:
+         * if loc has already been sent, returns true;
+         * otherwise, marks loc as sent.
+         * @return false if the loc has not been seen before
+ */
+ virtual bool getsetdup(DiskLoc loc) {
+ if( _multikey ) {
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
+ return !p.second;
+ }
+ return false;
+ }
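+
+        // Usage sketch (illustrative, not from the original source): a document
+        // { a : [ 1, 2 ] } indexed on { a : 1 } appears under two index keys, so
+        // a range scan can visit the same DiskLoc twice. Callers typically do
+        //     if ( c->getsetdup( c->currLoc() ) ) { c->advance(); continue; }
+        // so only the first visit is returned to the client.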
+
+ virtual bool modifiedKeys() const { return _multikey; }
+ virtual bool isMultiKey() const { return _multikey; }
+
+ /*const _KeyNode& _currKeyNode() const {
+ assert( !bucket.isNull() );
+ const _KeyNode& kn = keyNode(keyOfs);
+ assert( kn.isUsed() );
+ return kn;
+ }*/
+
+ /** returns BSONObj() if ofs is out of range */
+ virtual BSONObj keyAt(int ofs) const = 0;
+
+ virtual BSONObj currKey() const = 0;
+ virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) {
+ if ( bucket == b )
+ keyOfs = -1;
+ }
+
+ virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
+ virtual DiskLoc refLoc() { return currLoc(); }
+ virtual Record* _current() { return currLoc().rec(); }
+ virtual BSONObj current() { return BSONObj(_current()); }
+ virtual string toString();
+
+ BSONObj prettyKey( const BSONObj &key ) const {
+ return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
+ }
+
+ virtual BSONObj prettyIndexBounds() const;
+
+ virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ /** for debugging only */
+ const DiskLoc getBucket() const { return bucket; }
+ int getKeyOfs() const { return keyOfs; }
+
+ // just for unit tests
+ virtual bool curKeyHasChild() = 0;
+
+ protected:
+ /**
+ * Our btrees may (rarely) have "unused" keys when items are deleted.
+ * Skip past them.
+ */
+ virtual bool skipUnusedKeys() = 0;
+
+ bool skipOutOfRangeKeysAndCheckEnd();
+ void skipAndCheck();
+ void checkEnd();
+
+ /** selective audits on construction */
+ void audit();
+
+ virtual void _audit() = 0;
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0;
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0;
+
+ /** set initial bucket */
+ void initWithoutIndependentFieldRanges();
+
+ /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
+ void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );
+
+ set<DiskLoc> _dups;
+ NamespaceDetails * const d;
+ const int idxNo;
+ BSONObj startKey;
+ BSONObj endKey;
+ bool _endKeyInclusive;
+        bool _multikey; // must be re-checked every getMore batch in case the index has since become multikey
+ const IndexDetails& indexDetails;
+ const BSONObj _order;
+ const Ordering _ordering;
+ DiskLoc bucket;
+ int keyOfs;
+ const int _direction; // 1=fwd,-1=reverse
+ BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
+ DiskLoc locAtKeyOfs;
+ const shared_ptr< FieldRangeVector > _bounds;
+ auto_ptr< FieldRangeVectorIterator > _boundsIterator;
+ shared_ptr< CoveredIndexMatcher > _matcher;
+ bool _independentFieldRanges;
+ long long _nscanned;
+ };
+
+ template< class V >
+ struct Continuation {
+ //Continuation(const typename V::Key & k);
+ Continuation(DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ Ordering _order, IndexDetails& _idx) :
+ bLoc(thisLoc), recordLoc(_recordLoc), key(_key), order(_order), idx(_idx) {
+ op = Nothing;
+ }
+
+ DiskLoc bLoc;
+ DiskLoc recordLoc;
+ typename V::KeyOwned key;
+ const Ordering order;
+ IndexDetails& idx;
+ enum Op { Nothing, SetUsed, InsertHere } op;
+
+ int pos;
+ const BtreeBucket<V> *b;
+
+ void stepTwo() {
+ if( op == Nothing )
+ return;
+ else if( op == SetUsed ) {
+ const typename V::_KeyNode& kn = b->k(pos);
+ kn.writing().setUsed();
+ }
+ else {
+ b->insertHere(bLoc, pos, recordLoc, key, order, DiskLoc(), DiskLoc(), idx);
+ }
+ }
+ };
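+
+    /*
+     * Rough sketch of the intended two-phase use (inferred from the members
+     * above, not verbatim from the source): insertStepOne() locates the
+     * position and records which Op to perform, and stepTwo() applies it:
+     *
+     *     Continuation<V1> c( head, recordLoc, key, ordering, idx );
+     *     head.btree<V1>()->insertStepOne( head, c, dupsAllowed ); // sets op/pos/b
+     *     c.stepTwo();                                             // performs the deferred write
+     */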
+
+ /** Renames the index namespace for this btree's index. */
+ void renameIndexNamespace(const char *oldNs, const char *newNs);
+
+ /**
+ * give us a writable version of the btree bucket (declares write intent).
+ * note it is likely more efficient to declare write intent on something smaller when you can.
+ */
+ template< class V >
+ BtreeBucket<V> * DiskLoc::btreemod() const {
+ assert( _a != -1 );
+ BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() );
+ return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) );
+ }
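+
+    // Hedged usage note: btreemod() casts away const and declares write
+    // intent for the entire bucket through the durability layer, e.g.
+    //     BtreeBucket<V1> *b = loc.btreemod<V1>(); // whole-bucket write intent
+    // after which b may be mutated freely and the changes are journaled.
+    // As the comment above notes, declaring intent on just the bytes you
+    // change is usually cheaper when that is possible.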
+
+ template< class V >
+ BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) :
+ prevChildBucket(k.prevChildBucket),
+ recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
+ { }
+
+} // namespace mongo
diff --git a/src/mongo/db/btreebuilder.cpp b/src/mongo/db/btreebuilder.cpp
new file mode 100644
index 00000000000..0ec587a1958
--- /dev/null
+++ b/src/mongo/db/btreebuilder.cpp
@@ -0,0 +1,184 @@
+// btreebuilder.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "json.h"
+#include "clientcursor.h"
+#include "client.h"
+#include "dbhelpers.h"
+#include "curop-inl.h"
+#include "stats/counters.h"
+#include "dur_commitjob.h"
+#include "btreebuilder.h"
+
+namespace mongo {
+
+ /* --- BtreeBuilder --- */
+
+ template<class V>
+ BtreeBuilder<V>::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
+ dupsAllowed(_dupsAllowed),
+ idx(_idx),
+ n(0),
+ order( idx.keyPattern() ),
+ ordering( Ordering::make(idx.keyPattern()) ) {
+ first = cur = BtreeBucket<V>::addBucket(idx);
+ b = cur.btreemod<V>();
+ committed = false;
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::newBucket() {
+ DiskLoc L = BtreeBucket<V>::addBucket(idx);
+ b->setTempNext(L);
+ cur = L;
+ b = cur.btreemod<V>();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::mayCommitProgressDurably() {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ }
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::addKey(BSONObj& _key, DiskLoc loc) {
+
+ auto_ptr< KeyOwned > key( new KeyOwned(_key) );
+ if ( key->dataSize() > BtreeBucket<V>::KeyMax ) {
+ problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace()
+ << ' ' << key->dataSize() << ' ' << key->toString() << endl;
+ return;
+ }
+
+ if( !dupsAllowed ) {
+ if( n > 0 ) {
+ int cmp = keyLast->woCompare(*key, ordering);
+ massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 );
+ if( cmp == 0 ) {
+ //if( !dupsAllowed )
+ uasserted( ASSERT_ID_DUPKEY , BtreeBucket<V>::dupKeyError( idx , *keyLast ) );
+ }
+ }
+ }
+
+ if ( ! b->_pushBack(loc, *key, ordering, DiskLoc()) ) {
+ // bucket was full
+ newBucket();
+ b->pushBack(loc, *key, ordering, DiskLoc());
+ }
+ keyLast = key;
+ n++;
+ mayCommitProgressDurably();
+ }
+
+ template<class V>
+ void BtreeBuilder<V>::buildNextLevel(DiskLoc loc) {
+ int levels = 1;
+ while( 1 ) {
+ if( loc.btree<V>()->tempNext().isNull() ) {
+ // only 1 bucket at this level. we are done.
+ getDur().writingDiskLoc(idx.head) = loc;
+ break;
+ }
+ levels++;
+
+ DiskLoc upLoc = BtreeBucket<V>::addBucket(idx);
+ DiskLoc upStart = upLoc;
+ BtreeBucket<V> *up = upLoc.btreemod<V>();
+
+ DiskLoc xloc = loc;
+ while( !xloc.isNull() ) {
+ if ( getDur().commitIfNeeded() ) {
+ b = cur.btreemod<V>();
+ up = upLoc.btreemod<V>();
+ }
+
+ BtreeBucket<V> *x = xloc.btreemod<V>();
+ Key k;
+ DiskLoc r;
+ x->popBack(r,k);
+ bool keepX = ( x->n != 0 );
+ DiskLoc keepLoc = keepX ? xloc : x->nextChild;
+
+ if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
+ // current bucket full
+ DiskLoc n = BtreeBucket<V>::addBucket(idx);
+ up->setTempNext(n);
+ upLoc = n;
+ up = upLoc.btreemod<V>();
+ up->pushBack(r, k, ordering, keepLoc);
+ }
+
+ DiskLoc nextLoc = x->tempNext(); // get next in chain at current level
+ if ( keepX ) {
+ x->parent = upLoc;
+ }
+ else {
+ if ( !x->nextChild.isNull() ) {
+ DiskLoc ll = x->nextChild;
+ ll.btreemod<V>()->parent = upLoc;
+ //(x->nextChild.btreemod<V>())->parent = upLoc;
+ }
+ x->deallocBucket( xloc, idx );
+ }
+ xloc = nextLoc;
+ }
+
+ loc = upStart;
+ mayCommitProgressDurably();
+ }
+
+ if( levels > 1 )
+ log(2) << "btree levels: " << levels << endl;
+ }
+
+ /** when all addKeys are done, we then build the higher levels of the tree */
+ template<class V>
+ void BtreeBuilder<V>::commit() {
+ buildNextLevel(first);
+ committed = true;
+ }
+
+ template<class V>
+ BtreeBuilder<V>::~BtreeBuilder() {
+ DESTRUCTOR_GUARD(
+ if( !committed ) {
+ log(2) << "Rolling back partially built index space" << endl;
+ DiskLoc x = first;
+ while( !x.isNull() ) {
+ DiskLoc next = x.btree<V>()->tempNext();
+ string ns = idx.indexNamespace();
+ theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x);
+ x = next;
+ getDur().commitIfNeeded();
+ }
+ assert( idx.head.isNull() );
+ log(2) << "done rollback" << endl;
+ }
+ )
+ }
+
+ template class BtreeBuilder<V0>;
+ template class BtreeBuilder<V1>;
+
+}
diff --git a/src/mongo/db/btreebuilder.h b/src/mongo/db/btreebuilder.h
new file mode 100644
index 00000000000..6de55d89299
--- /dev/null
+++ b/src/mongo/db/btreebuilder.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "btree.h"
+
+namespace mongo {
+
+ /**
+ * build btree from the bottom up
+ */
+ template< class V >
+ class BtreeBuilder {
+ typedef typename V::KeyOwned KeyOwned;
+ typedef typename V::Key Key;
+
+ bool dupsAllowed;
+ IndexDetails& idx;
+ /** Number of keys added to btree. */
+ unsigned long long n;
+ /** Last key passed to addKey(). */
+ auto_ptr< typename V::KeyOwned > keyLast;
+ BSONObj order;
+ Ordering ordering;
+ /** true iff commit() completed successfully. */
+ bool committed;
+
+ DiskLoc cur, first;
+ BtreeBucket<V> *b;
+
+ void newBucket();
+ void buildNextLevel(DiskLoc);
+ void mayCommitProgressDurably();
+
+ public:
+ ~BtreeBuilder();
+
+ BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
+
+ /**
+ * Preconditions: 'key' is > or >= last key passed to this function (depends on _dupsAllowed)
+ * Postconditions: 'key' is added to intermediate storage.
+ */
+ void addKey(BSONObj& key, DiskLoc loc);
+
+ /**
+ * commit work. if not called, destructor will clean up partially completed work
+ * (in case exception has happened).
+ */
+ void commit();
+
+ unsigned long long getn() { return n; }
+ };
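+
+    /*
+     * Usage sketch (illustrative, assuming an IndexDetails 'idx' and keys
+     * arriving already sorted in index order, as addKey() requires):
+     *
+     *     BtreeBuilder<V1> builder( dupsAllowed, idx );
+     *     for each (key, loc) in sorted order:
+     *         builder.addKey( key, loc );
+     *     builder.commit();   // builds the upper levels; omitting it rolls back
+     */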
+
+}
diff --git a/src/mongo/db/btreecursor.cpp b/src/mongo/db/btreecursor.cpp
new file mode 100644
index 00000000000..7ddd4874ef6
--- /dev/null
+++ b/src/mongo/db/btreecursor.cpp
@@ -0,0 +1,457 @@
+// btreecursor.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "curop-inl.h"
+#include "queryutil.h"
+
+namespace mongo {
+
+ template< class V >
+ class BtreeCursorImpl : public BtreeCursor {
+ public:
+ typedef typename BucketBasics<V>::KeyNode KeyNode;
+ typedef typename V::Key Key;
+ typedef typename V::_KeyNode _KeyNode;
+
+ BtreeCursorImpl(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorImpl(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction )
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj keyAt(int ofs) const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ int n = b->getN();
+ if( n == 0xffff ) {
+ throw UserException(15850, "keyAt bucket deleted");
+ }
+ dassert( n >= 0 && n < 10000 );
+ return ofs >= n ? BSONObj() : b->keyNode(ofs).key.toBson();
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V>()->keyNode(keyOfs).key.toBson();
+ }
+
+ virtual bool curKeyHasChild() {
+ return !currKeyNode().prevChildBucket.isNull();
+ }
+
+ bool skipUnusedKeys() {
+ int u = 0;
+ while ( 1 ) {
+ if ( !ok() )
+ break;
+ const _KeyNode& kn = keyNode(keyOfs);
+ if ( kn.isUsed() )
+ break;
+ bucket = _advance(bucket, keyOfs, _direction, "skipUnusedKeys");
+ u++;
+ //don't include unused keys in nscanned
+ //++_nscanned;
+ }
+ if ( u > 10 )
+ OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
+ return u;
+ }
+
+ /* Since the last noteLocation(), our key may have moved around, and that old cached
+ information may thus be stale and wrong (although often it is right). We check
+           that here; if we have moved, we have to search back to find where we were.
+
+ i.e., after operations on the index, the BtreeCursor's cached location info may
+ be invalid. This function ensures validity, so you should call it before using
+ the cursor if other writers have used the database since the last noteLocation
+ call.
+ */
+ void checkLocation() {
+ if ( eof() )
+ return;
+
+ _multikey = d->isMultikey(idxNo);
+
+ if ( keyOfs >= 0 ) {
+ assert( !keyAtKeyOfs.isEmpty() );
+
+ try {
+ // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
+ // which is possible as keys may have been deleted.
+ int x = 0;
+ while( 1 ) {
+ // if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
+ // b->k(keyOfs).recordLoc == locAtKeyOfs ) {
+ if ( keyAt(keyOfs).binaryEqual(keyAtKeyOfs) ) {
+ const _KeyNode& kn = keyNode(keyOfs);
+ if( kn.recordLoc == locAtKeyOfs ) {
+ if ( !kn.isUsed() ) {
+ // we were deleted but still exist as an unused
+ // marker key. advance.
+ skipUnusedKeys();
+ }
+ return;
+ }
+ }
+
+ // we check one key earlier too, in case a key was just deleted. this is
+ // important so that multi updates are reasonably fast.
+ if( keyOfs == 0 || x++ )
+ break;
+ keyOfs--;
+ }
+ }
+ catch(UserException& e) {
+ if( e.getCode() != 15850 )
+ throw;
+ // hack: fall through if bucket was just deleted. should only happen under deleteObjects()
+ DEV log() << "debug info: bucket was deleted" << endl;
+ }
+ }
+
+ /* normally we don't get to here. when we do, old position is no longer
+ valid and we must refind where we left off (which is expensive)
+ */
+
+ /* TODO: Switch to keep indexdetails and do idx.head! */
+ bucket = _locate(keyAtKeyOfs, locAtKeyOfs);
+ RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl;
+ if ( ! bucket.isNull() )
+ skipUnusedKeys();
+
+ }
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) {
+ bool found;
+ return indexDetails.head.btree<V>()->
+ locate(indexDetails, indexDetails.head, key, _ordering, keyOfs, found, loc, _direction);
+ }
+
+ const _KeyNode& keyNode(int keyOfs) const {
+ return bucket.btree<V>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V> *b = bucket.btree<V>();
+ return b->keyNode(keyOfs);
+ }
+ };
+
+ template class BtreeCursorImpl<V0>;
+ template class BtreeCursorImpl<V1>;
+
+ /*
+ class BtreeCursorV1 : public BtreeCursor {
+ public:
+ typedef BucketBasics<V1>::KeyNode KeyNode;
+ typedef V1::Key Key;
+
+ BtreeCursorV1(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) :
+ BtreeCursor(a,b,c,d,e,f,g) { }
+ BtreeCursorV1(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction) :
+ BtreeCursor(_d,_idxNo,_id,_bounds,_direction)
+ {
+ pair< DiskLoc, int > noBestParent;
+ indexDetails.head.btree<V1>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent );
+ skipAndCheck();
+ dassert( _dups.size() == 0 );
+ }
+
+ virtual DiskLoc currLoc() {
+ if( bucket.isNull() ) return DiskLoc();
+ return currKeyNode().recordLoc;
+ }
+
+ virtual BSONObj currKey() const {
+ assert( !bucket.isNull() );
+ return bucket.btree<V1>()->keyNode(keyOfs).key.toBson();
+ }
+
+ protected:
+ virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
+ thisLoc.btree<V1>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
+ }
+ virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V1>()->advance(thisLoc, keyOfs, direction, caller);
+ }
+ virtual void _audit() {
+ out() << "BtreeCursor(). dumping head bucket" << endl;
+ indexDetails.head.btree<V1>()->dump();
+ }
+ virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc);
+ virtual const _KeyNode& keyNode(int keyOfs) {
+ return bucket.btree<V1>()->k(keyOfs);
+ }
+
+ private:
+ const KeyNode currKeyNode() const {
+ assert( !bucket.isNull() );
+ const BtreeBucket<V1> *b = bucket.btree<V1>();
+ return b->keyNode(keyOfs);
+ }
+ };*/
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, const IndexDetails& _id,
+ const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ {
+ return make( _d, _d->idxNo( (IndexDetails&) _id), _id, _bounds, _direction );
+ }
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, const IndexDetails& _id,
+ const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction)
+ {
+ return make( _d, _d->idxNo( (IndexDetails&) _id), _id, startKey, endKey, endKeyInclusive, direction );
+ }
+
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction)
+ {
+ int v = _id.version();
+ BtreeCursor *c = 0;
+ if( v == 1 ) {
+ c = new BtreeCursorImpl<V1>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else if( v == 0 ) {
+ c = new BtreeCursorImpl<V0>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction);
+ }
+ else {
+ uasserted(14800, str::stream() << "unsupported index version " << v);
+ }
+ c->initWithoutIndependentFieldRanges();
+ dassert( c->_dups.size() == 0 );
+ return c;
+ }
+
+ BtreeCursor* BtreeCursor::make(
+ NamespaceDetails *_d, int _idxNo, const IndexDetails& _id,
+ const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ {
+ int v = _id.version();
+ if( v == 1 )
+ return new BtreeCursorImpl<V1>(_d,_idxNo,_id,_bounds,_direction);
+ if( v == 0 )
+ return new BtreeCursorImpl<V0>(_d,_idxNo,_id,_bounds,_direction);
+ uasserted(14801, str::stream() << "unsupported index version " << v);
+
+ // just check we are in sync with this method
+ dassert( IndexDetails::isASupportedIndexVersionNumber(v) );
+
+ return 0;
+ }
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id,
+ const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) :
+ d(_d), idxNo(_idxNo),
+ startKey( _startKey ),
+ endKey( _endKey ),
+ _endKeyInclusive( endKeyInclusive ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _independentFieldRanges( false ),
+ _nscanned( 0 ) {
+ audit();
+ }
+
+ BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction )
+ :
+ d(_d), idxNo(_idxNo),
+ _endKeyInclusive( true ),
+ _multikey( d->isMultikey( idxNo ) ),
+ indexDetails( _id ),
+ _order( _id.keyPattern() ),
+ _ordering( Ordering::make( _order ) ),
+ _direction( _direction ),
+ _bounds( ( assert( _bounds.get() ), _bounds ) ),
+ _boundsIterator( new FieldRangeVectorIterator( *_bounds ) ),
+ _independentFieldRanges( true ),
+ _nscanned( 0 ) {
+ audit();
+ startKey = _bounds->startKey();
+ _boundsIterator->advance( startKey ); // handles initialization
+ _boundsIterator->prepDive();
+ bucket = indexDetails.head;
+ keyOfs = 0;
+ }
+
+ /** Properly destroy forward declared class members. */
+ BtreeCursor::~BtreeCursor() {}
+
+ void BtreeCursor::audit() {
+ dassert( d->idxNo((IndexDetails&) indexDetails) == idxNo );
+ }
+
+ void BtreeCursor::initWithoutIndependentFieldRanges() {
+ if ( indexDetails.getSpec().getType() ) {
+ startKey = indexDetails.getSpec().getType()->fixKey( startKey );
+ endKey = indexDetails.getSpec().getType()->fixKey( endKey );
+ }
+ bucket = _locate(startKey, _direction > 0 ? minDiskLoc : maxDiskLoc);
+ if ( ok() ) {
+ _nscanned = 1;
+ }
+ skipUnusedKeys();
+ checkEnd();
+ }
+
+ void BtreeCursor::skipAndCheck() {
+ long long startNscanned = _nscanned;
+ skipUnusedKeys();
+ while( 1 ) {
+ if ( !skipOutOfRangeKeysAndCheckEnd() ) {
+ break;
+ }
+ do {
+ if ( _nscanned > startNscanned + 20 ) {
+ skipUnusedKeys();
+ return;
+ }
+ } while( skipOutOfRangeKeysAndCheckEnd() );
+ if ( !skipUnusedKeys() ) {
+ break;
+ }
+ }
+ }
+
+ bool BtreeCursor::skipOutOfRangeKeysAndCheckEnd() {
+ if ( !ok() ) {
+ return false;
+ }
+ int ret = _boundsIterator->advance( currKey() );
+ if ( ret == -2 ) {
+ bucket = DiskLoc();
+ return false;
+ }
+ else if ( ret == -1 ) {
+ ++_nscanned;
+ return false;
+ }
+ ++_nscanned;
+ advanceTo( currKey(), ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() );
+ return true;
+ }
+
+ // Return a value in the set {-1, 0, 1} to represent the sign of parameter i.
+ int sgn( int i ) {
+ if ( i == 0 )
+ return 0;
+ return i > 0 ? 1 : -1;
+ }
+
+ // Check if the current key is beyond endKey.
+ void BtreeCursor::checkEnd() {
+ if ( bucket.isNull() )
+ return;
+ if ( !endKey.isEmpty() ) {
+ int cmp = sgn( endKey.woCompare( currKey(), _order ) );
+ if ( ( cmp != 0 && cmp != _direction ) ||
+ ( cmp == 0 && !_endKeyInclusive ) )
+ bucket = DiskLoc();
+ }
+ }
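+
+    // Worked example (explanatory, not in the original): scanning forward
+    // (_direction == 1) with an exclusive endKey, currKey() == endKey gives
+    // cmp == 0 and !_endKeyInclusive, so bucket is nulled and the cursor is
+    // eof. If currKey() were past endKey, cmp would be -1 != _direction,
+    // which stops the scan as well.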
+
+ void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive) {
+ _advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction );
+ }
+
+ bool BtreeCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( bucket.isNull() )
+ return false;
+
+ bucket = _advance(bucket, keyOfs, _direction, "BtreeCursor::advance");
+
+ if ( !_independentFieldRanges ) {
+ skipUnusedKeys();
+ checkEnd();
+ if ( ok() ) {
+ ++_nscanned;
+ }
+ }
+ else {
+ skipAndCheck();
+ }
+ return ok();
+ }
+
+ void BtreeCursor::noteLocation() {
+ if ( !eof() ) {
+ BSONObj o = currKey().getOwned();
+ keyAtKeyOfs = o;
+ locAtKeyOfs = currLoc();
+ }
+ }
+
+ string BtreeCursor::toString() {
+ string s = string("BtreeCursor ") + indexDetails.indexName();
+ if ( _direction < 0 ) s += " reverse";
+ if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
+ return s;
+ }
+
+ BSONObj BtreeCursor::prettyIndexBounds() const {
+ if ( !_independentFieldRanges ) {
+ return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
+ }
+ else {
+ return _bounds->obj();
+ }
+ }
+
+ /* ----------------------------------------------------------------------------- */
+
+ struct BtreeCursorUnitTest {
+ BtreeCursorUnitTest() {
+ assert( minDiskLoc.compare(maxDiskLoc) < 0 );
+ }
+ } btut;
+
+} // namespace mongo
diff --git a/src/mongo/db/cap.cpp b/src/mongo/db/cap.cpp
new file mode 100644
index 00000000000..a8be2383115
--- /dev/null
+++ b/src/mongo/db/cap.cpp
@@ -0,0 +1,457 @@
+// @file cap.cpp capped collection related
+// the "old" version (<= v1.6)
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../scripting/engine.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "json.h"
+#include "clientcursor.h"
+
+/*
+ capped collection layout
+
+ d's below won't exist if things align perfectly:
+
+ extent1 -> extent2 -> extent3
+ ------------------- ----------------------- ---------------------
+ d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d
+ ^ ^
+ oldest newest
+
+ ^cappedFirstDeletedInCurExtent()
+ ^cappedLastDelRecLastExtent()
+ ^cappedListOfAllDeletedRecords()
+*/
+
+
+namespace mongo {
+
+ /* combine adjacent deleted records *for the current extent* of the capped collection
+
+ this is O(n^2) but we call it for capped tables where typically n==1 or 2!
+ (or 3...there will be a little unused sliver at the end of the extent.)
+ */
+ void NamespaceDetails::compact() {
+ assert(capped);
+
+ list<DiskLoc> drecs;
+
+ // Pull out capExtent's DRs from deletedList
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
+ drecs.push_back( i );
+
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;
+
+ // This is the O(n^2) part.
+ drecs.sort();
+
+ list<DiskLoc>::iterator j = drecs.begin();
+ assert( j != drecs.end() );
+ DiskLoc a = *j;
+ while ( 1 ) {
+ j++;
+ if ( j == drecs.end() ) {
+ DEBUGGING out() << "TEMP: compact adddelrec\n";
+ addDeletedRec(a.drec(), a);
+ break;
+ }
+ DiskLoc b = *j;
+ while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
+ // a & b are adjacent. merge.
+ getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
+ j++;
+ if ( j == drecs.end() ) {
+ DEBUGGING out() << "temp: compact adddelrec2\n";
+ addDeletedRec(a.drec(), a);
+ return;
+ }
+ b = *j;
+ }
+ DEBUGGING out() << "temp: compact adddelrec3\n";
+ addDeletedRec(a.drec(), a);
+ a = b;
+ }
+ }
+
+ DiskLoc &NamespaceDetails::cappedFirstDeletedInCurExtent() {
+ if ( cappedLastDelRecLastExtent().isNull() )
+ return cappedListOfAllDeletedRecords();
+ else
+ return cappedLastDelRecLastExtent().drec()->nextDeleted;
+ }
+
+ void NamespaceDetails::cappedCheckMigrate() {
+ // migrate old NamespaceDetails format
+ assert( capped );
+ if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) {
+ //capFirstNewRecord = DiskLoc();
+ capFirstNewRecord.writing().setInvalid();
+ // put all the DeletedRecords in cappedListOfAllDeletedRecords()
+ for ( int i = 1; i < Buckets; ++i ) {
+ DiskLoc first = deletedList[ i ];
+ if ( first.isNull() )
+ continue;
+ DiskLoc last = first;
+ for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted );
+ last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords();
+ cappedListOfAllDeletedRecords().writing() = first;
+ deletedList[i].writing() = DiskLoc();
+ }
+ // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
+
+ // Last, in case we're killed before getting here
+ capExtent.writing() = firstExtent;
+ }
+ }
+
+ bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
+ assert( !dl.isNull() );
+ // We could have a rec or drec, doesn't matter.
+ bool res = dl.drec()->myExtentLoc(dl) == capExtent;
+ DEV {
+ // old implementation. this check is temp to test works the same. new impl should be a little faster.
+ assert( res == (dl.drec()->myExtent( dl ) == capExtent.ext()) );
+ }
+ return res;
+ }
+
+ bool NamespaceDetails::nextIsInCapExtent( const DiskLoc &dl ) const {
+ assert( !dl.isNull() );
+ DiskLoc next = dl.drec()->nextDeleted;
+ if ( next.isNull() )
+ return false;
+ return inCapExtent( next );
+ }
+
+ void NamespaceDetails::advanceCapExtent( const char *ns ) {
+ // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
+ // (or DiskLoc() if new capExtent == firstExtent)
+ if ( capExtent == lastExtent )
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
+ else {
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted );
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = i;
+ }
+
+ getDur().writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
+
+ /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */
+ //dassert( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ getDur().writingDiskLoc( capFirstNewRecord ) = DiskLoc();
+ }
+
+ DiskLoc NamespaceDetails::__capAlloc( int len ) {
+ DiskLoc prev = cappedLastDelRecLastExtent();
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ DiskLoc ret;
+ for (; !i.isNull() && inCapExtent( i ); prev = i, i = i.drec()->nextDeleted ) {
+ // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(),
+ // so make sure there's space to create a DR at the end.
+ if ( i.drec()->lengthWithHeaders >= len + 24 ) {
+ ret = i;
+ break;
+ }
+ }
+
+ /* unlink ourself from the deleted list */
+ if ( !ret.isNull() ) {
+ if ( prev.isNull() )
+ cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted;
+ else
+ prev.drec()->nextDeleted.writing() = ret.drec()->nextDeleted;
+ ret.drec()->nextDeleted.writing().setInvalid(); // defensive.
+ assert( ret.drec()->extentOfs < ret.getOfs() );
+ }
+
+ return ret;
+ }
+
+ DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
+ // signal done allocating new extents.
+ if ( !cappedLastDelRecLastExtent().isValid() )
+ getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
+
+ assert( len < 400000000 );
+ int passes = 0;
+ int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
+ if ( maxPasses < 5000 ) {
+            // this is for backwards safety since 5000 was the old value
+ maxPasses = 5000;
+ }
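+
+        // Worked example: len == 3000 gives maxPasses = 3000/30 + 2 == 102,
+        // which is then raised to the 5000 floor; only allocations larger than
+        // roughly 150KB yield a computed maxPasses above the old 5000 default.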
+ DiskLoc loc;
+
+        // delete records until we have room and the max # objects limit is satisfied.
+
+ /* this fails on a rename -- that is ok but must keep commented out */
+ //assert( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ DiskLoc firstEmptyExtent;
+ while ( 1 ) {
+ if ( stats.nrecords < max ) {
+ loc = __capAlloc( len );
+ if ( !loc.isNull() )
+ break;
+ }
+
+ // If on first iteration through extents, don't delete anything.
+ if ( !capFirstNewRecord.isValid() ) {
+ advanceCapExtent( ns );
+
+ if ( capExtent != firstExtent )
+ capFirstNewRecord.writing().setInvalid();
+ // else signal done with first iteration through extents.
+ continue;
+ }
+
+ if ( !capFirstNewRecord.isNull() &&
+ theCapExtent()->firstRecord == capFirstNewRecord ) {
+ // We've deleted all records that were allocated on the previous
+ // iteration through this extent.
+ advanceCapExtent( ns );
+ continue;
+ }
+
+ if ( theCapExtent()->firstRecord.isNull() ) {
+ if ( firstEmptyExtent.isNull() )
+ firstEmptyExtent = capExtent;
+ advanceCapExtent( ns );
+ if ( firstEmptyExtent == capExtent ) {
+ maybeComplain( ns, len );
+ return DiskLoc();
+ }
+ continue;
+ }
+
+ DiskLoc fr = theCapExtent()->firstRecord;
+ theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); // ZZZZZZZZZZZZ
+ compact();
+ if( ++passes > maxPasses ) {
+ log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n';
+ log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl;
+ massert( 10345 , "passes >= maxPasses in capped collection alloc", false );
+ }
+ }
+
+ // Remember first record allocated on this iteration through capExtent.
+ if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
+ getDur().writingDiskLoc(capFirstNewRecord) = loc;
+
+ return loc;
+ }
+
+ void NamespaceDetails::dumpExtents() {
+ cout << "dumpExtents:" << endl;
+ for ( DiskLoc i = firstExtent; !i.isNull(); i = i.ext()->xnext ) {
+ Extent *e = i.ext();
+ stringstream ss;
+ e->dump(ss);
+ cout << ss.str() << endl;
+ }
+ }
+
+ void NamespaceDetails::cappedDumpDelInfo() {
+ cout << "dl[0]: " << deletedList[0].toString() << endl;
+ for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) {
+ cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders <<
+ " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl;
+ }
+ cout << "dl[1]: " << deletedList[1].toString() << endl;
+ }
+
+ void NamespaceDetails::cappedTruncateLastDelUpdate() {
+ if ( capExtent == firstExtent ) {
+ // Only one extent of the collection is in use, so there
+ // is no deleted record in a previous extent, so nullify
+ // cappedLastDelRecLastExtent().
+ cappedLastDelRecLastExtent().writing() = DiskLoc();
+ }
+ else {
+ // Scan through all deleted records in the collection
+ // until the last deleted record for the extent prior
+ // to the new capExtent is found. Then set
+ // cappedLastDelRecLastExtent() to that deleted record.
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for( ;
+ !i.drec()->nextDeleted.isNull() &&
+ !inCapExtent( i.drec()->nextDeleted );
+ i = i.drec()->nextDeleted );
+ // In our capped storage model, every extent must have at least one
+ // deleted record. Here we check that 'i' is not the last deleted
+ // record. (We expect that there will be deleted records in the new
+ // capExtent as well.)
+ assert( !i.drec()->nextDeleted.isNull() );
+ cappedLastDelRecLastExtent().writing() = i;
+ }
+ }
+
+ void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
+ DEV assert( this == nsdetails(ns) );
+ assert( cappedLastDelRecLastExtent().isValid() );
+
+ // We iteratively remove the newest document until the newest document
+ // is 'end', then we remove 'end' if requested.
+ bool foundLast = false;
+ while( 1 ) {
+ if ( foundLast ) {
+ // 'end' has been found and removed, so break.
+ break;
+ }
+ getDur().commitIfNeeded();
+ // 'curr' will point to the newest document in the collection.
+ DiskLoc curr = theCapExtent()->lastRecord;
+ assert( !curr.isNull() );
+ if ( curr == end ) {
+ if ( inclusive ) {
+ // 'end' has been found, so break next iteration.
+ foundLast = true;
+ }
+ else {
+ // 'end' has been found, so break.
+ break;
+ }
+ }
+
+ // TODO The algorithm used in this function cannot generate an
+ // empty collection, but we could call emptyCappedCollection() in
+ // this case instead of asserting.
+ uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 );
+
+ // Delete the newest record, and coalesce the new deleted
+ // record with existing deleted records.
+ theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
+ compact();
+
+ // This is the case where we have not yet had to remove any
+ // documents to make room for other documents, and we are allocating
+ // documents from free space in fresh extents instead of reusing
+ // space from familiar extents.
+ if ( !capLooped() ) {
+
+ // We just removed the last record from the 'capExtent', and
+ // the 'capExtent' can't be empty, so we set 'capExtent' to
+ // capExtent's prev extent.
+ if ( theCapExtent()->lastRecord.isNull() ) {
+ assert( !theCapExtent()->xprev.isNull() );
+ // NOTE Because we didn't delete the last document, and
+ // capLooped() is false, capExtent is not the first extent
+ // so xprev will be nonnull.
+ capExtent.writing() = theCapExtent()->xprev;
+ theCapExtent()->assertOk();
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
+ }
+ continue;
+ }
+
+ // This is the case where capLooped() is true, and we just deleted
+ // from capExtent, and we just deleted capFirstNewRecord, which was
+ // the last record on the fresh side of capExtent.
+ // NOTE In this comparison, curr and potentially capFirstNewRecord
+ // may point to invalid data, but we can still compare the
+ // references themselves.
+ if ( curr == capFirstNewRecord ) {
+
+ // Set 'capExtent' to the first nonempty extent prior to the
+ // initial capExtent. There must be such an extent because we
+ // have not deleted the last document in the collection. It is
+ // possible that all extents other than the capExtent are empty.
+ // In this case we will keep the initial capExtent and specify
+ // that all records contained within are on the fresh rather than
+ // stale side of the extent.
+ DiskLoc newCapExtent = capExtent;
+ do {
+ // Find the previous extent, looping if necessary.
+ newCapExtent = ( newCapExtent == firstExtent ) ? lastExtent : newCapExtent.ext()->xprev;
+ newCapExtent.ext()->assertOk();
+ }
+ while ( newCapExtent.ext()->firstRecord.isNull() );
+ capExtent.writing() = newCapExtent;
+
+ // Place all documents in the new capExtent on the fresh side
+ // of the capExtent by setting capFirstNewRecord to the first
+ // document in the new capExtent.
+ capFirstNewRecord.writing() = theCapExtent()->firstRecord;
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate();
+ }
+ }
+ }
+
+ void NamespaceDetails::emptyCappedCollection( const char *ns ) {
+ DEV assert( this == nsdetails(ns) );
+ massert( 13424, "collection must be capped", capped );
+ massert( 13425, "background index build in progress", !indexBuildInProgress );
+ massert( 13426, "indexes present", nIndexes == 0 );
+
+ // Clear all references to this namespace.
+ ClientCursor::invalidate( ns );
+ NamespaceDetailsTransient::clearForPrefix( ns );
+
+ // Get a writeable reference to 'this' and reset all pertinent
+ // attributes.
+ NamespaceDetails *t = writingWithoutExtra();
+
+ t->cappedLastDelRecLastExtent() = DiskLoc();
+ t->cappedListOfAllDeletedRecords() = DiskLoc();
+
+ // preserve firstExtent/lastExtent
+ t->capExtent = firstExtent;
+ t->stats.datasize = stats.nrecords = 0;
+ // lastExtentSize preserve
+ // nIndexes preserve 0
+ // capped preserve true
+ // max preserve
+ t->paddingFactor = 1.0;
+ t->flags = 0;
+ t->capFirstNewRecord = DiskLoc();
+ t->capFirstNewRecord.setInvalid();
+ t->cappedLastDelRecLastExtent().setInvalid();
+ // dataFileVersion preserve
+ // indexFileVersion preserve
+ t->multiKeyIndexBits = 0;
+ t->reservedA = 0;
+ t->extraOffset = 0;
+ // indexBuildInProgress preserve 0
+ memset(t->reserved, 0, sizeof(t->reserved));
+
+ // Reset all existing extents and recreate the deleted list.
+ for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
+ DiskLoc prev = ext.ext()->xprev;
+ DiskLoc next = ext.ext()->xnext;
+ DiskLoc empty = ext.ext()->reuse( ns, true );
+ ext.ext()->xprev.writing() = prev;
+ ext.ext()->xnext.writing() = next;
+ addDeletedRec( empty.drec(), empty );
+ }
+ }
+
+}
diff --git a/src/mongo/db/client.cpp b/src/mongo/db/client.cpp
new file mode 100644
index 00000000000..92b78d87ee5
--- /dev/null
+++ b/src/mongo/db/client.cpp
@@ -0,0 +1,697 @@
+// client.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Client represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "client.h"
+#include "curop-inl.h"
+#include "json.h"
+#include "security.h"
+#include "commands.h"
+#include "instance.h"
+#include "../s/d_logic.h"
+#include "dbwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/mongoutils/checksum.h"
+#include "../util/file_allocator.h"
+#include "repl/rs.h"
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ Client* Client::syncThread;
+ mongo::mutex Client::clientsMutex("clientsMutex");
+ set<Client*> Client::clients; // always be in clientsMutex when manipulating this
+
+ TSP_DEFINE(Client, currentClient)
+
+#if defined(_DEBUG)
+ struct StackChecker;
+ ThreadLocalValue<StackChecker *> checker;
+
+ struct StackChecker {
+ enum { SZ = 256 * 1024 };
+ char buf[SZ];
+ StackChecker() {
+ checker.set(this);
+ }
+ void init() {
+ memset(buf, 42, sizeof(buf));
+ }
+ static void check(const char *tname) {
+ static int max;
+ StackChecker *sc = checker.get();
+ const char *p = sc->buf;
+ int i = 0;
+ for( ; i < SZ; i++ ) {
+ if( p[i] != 42 )
+ break;
+ }
+ int z = SZ-i;
+ if( z > max ) {
+ max = z;
+ log() << "thread " << tname << " stack usage was " << z << " bytes" << endl;
+ }
+ wassert( i > 16000 );
+ }
+ };
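+
+    // Explanatory note: init() paints the 256KB buffer with the byte 42; any
+    // stack frames that later grow down through it overwrite the pattern.
+    // check() scans from the low end for the first non-42 byte, so i is the
+    // untouched headroom and SZ - i is the stack high-water mark; the
+    // wassert( i > 16000 ) warns when less than ~16KB of headroom remained.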
+#endif
+
+ /* each thread which does db operations has a Client object in TLS.
+ call this when your thread starts.
+ */
+#if defined _DEBUG
+ static unsigned long long nThreads = 0;
+ void assertStartingUp() {
+ assert( nThreads <= 1 );
+ }
+#else
+ void assertStartingUp() { }
+#endif
+
+ Client& Client::initThread(const char *desc, AbstractMessagingPort *mp) {
+#if defined(_DEBUG)
+ {
+            nThreads++; // never decremented; used by the assertStartingUp() sanity asserts
+ if( sizeof(void*) == 8 ) {
+ StackChecker sc;
+ sc.init();
+ }
+ }
+#endif
+ assert( currentClient.get() == 0 );
+ Client *c = new Client(desc, mp);
+ currentClient.reset(c);
+ mongo::lastError.initThread();
+ return *c;
+ }
+
+ Client::Client(const char *desc, AbstractMessagingPort *p) :
+ _context(0),
+ _shutdown(false),
+ _desc(desc),
+ _god(0),
+ _lastOp(0),
+ _mp(p),
+ _sometimes(0)
+ {
+ _hasWrittenThisPass = false;
+ _pageFaultRetryableSection = 0;
+ _connectionId = setThreadName(desc);
+ _curOp = new CurOp( this );
+#ifndef _WIN32
+ stringstream temp;
+ temp << hex << showbase << pthread_self();
+ _threadId = temp.str();
+#endif
+ scoped_lock bl(clientsMutex);
+ clients.insert(this);
+ }
+
+ Client::~Client() {
+ _god = 0;
+
+ if ( _context )
+ error() << "Client::~Client _context should be null but is not; client:" << _desc << endl;
+
+ if ( ! _shutdown ) {
+ error() << "Client::shutdown not called: " << _desc << endl;
+ }
+
+ if ( ! inShutdown() ) {
+ // we can't clean up safely once we're in shutdown
+ scoped_lock bl(clientsMutex);
+ if ( ! _shutdown )
+ clients.erase(this);
+ delete _curOp;
+ }
+ }
+
+ bool Client::shutdown() {
+#if defined(_DEBUG)
+ {
+ if( sizeof(void*) == 8 ) {
+ StackChecker::check( desc() );
+ }
+ }
+#endif
+ _shutdown = true;
+ if ( inShutdown() )
+ return false;
+ {
+ scoped_lock bl(clientsMutex);
+ clients.erase(this);
+ if ( isSyncThread() ) {
+ syncThread = 0;
+ }
+ }
+
+ return false;
+ }
+
+ BSONObj CachedBSONObj::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}");
+ Client::Context::Context( string ns , Database * db, bool doauth ) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( mongo::dbpath ), // is this right? could be a different db? may need a dassert for this
+ _justCreated(false),
+ _ns( ns ),
+ _db(db)
+ {
+ assert( db == 0 || db->isOk() );
+ _client->_context = this;
+ checkNsAccess( doauth );
+ _client->checkLocks();
+ }
+
+ Client::Context::Context(const string& ns, string path , bool doauth ) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( path ),
+ _justCreated(false), // set for real in finishInit
+ _ns( ns ),
+ _db(0)
+ {
+ _finishInit( doauth );
+ _client->checkLocks();
+ }
+
+ /** "read lock, and set my context, all in one operation"
+ * This handles (if not recursively locked) opening an unopened database.
+ */
+ Client::ReadContext::ReadContext(const string& ns, string path, bool doauth ) {
+ {
+ lk.reset( new _LockCollectionForReading(ns) );
+ Database *db = dbHolder().get(ns, path);
+ if( db ) {
+ c.reset( new Context(path, ns, db, doauth) );
+ return;
+ }
+ }
+
+ // we usually don't get here, so doesn't matter how fast this part is
+ {
+ int x = d.dbMutex.getState();
+ if( x > 0 ) {
+ // write locked already
+ DEV RARELY log() << "write locked on ReadContext construction " << ns << endl;
+ c.reset( new Context(ns, path, doauth) );
+ }
+ else if( x == -1 ) {
+ lk.reset(0);
+ {
+ writelock w;
+ Context c(ns, path, doauth);
+ }
+ // db could be closed at this interim point -- that is ok, we will throw, and don't mind throwing.
+ lk.reset( new _LockCollectionForReading(ns) );
+ c.reset( new Context(ns, path, doauth) );
+ }
+ else {
+ assert( x < -1 );
+ uasserted(15928, str::stream() << "can't open a database from a nested read lock " << ns);
+ }
+ }
+
+ // todo: are receipts of thousands of queries for a nonexisting database a potential
+ // cause of bad performance due to the write lock acquisition above? let's fix that.
+ // it would be easy to first check that there is at least a .ns file, or something similar.
+ }
+
+ void Client::Context::checkNotStale() const {
+ switch ( _client->_curOp->getOp() ) {
+ case dbGetMore: // getMore's are special and should be handled else where
+ case dbUpdate: // update & delete check shard version in instance.cpp, so don't check here as well
+ case dbDelete:
+ break;
+ default: {
+ string errmsg;
+ if ( ! shardVersionOk( _ns , errmsg ) ) {
+ ostringstream os;
+ os << "[" << _ns << "] shard version not ok in Client::Context: " << errmsg;
+ throw SendStaleConfigException( _ns, os.str() );
+ }
+ }
+ }
+ }
+
+ // invoked from ReadContext
+ Client::Context::Context(const string& path, const string& ns, Database *db , bool doauth) :
+ _client( currentClient.get() ),
+ _oldContext( _client->_context ),
+ _path( path ),
+ _justCreated(false),
+ _ns( ns ),
+ _db(db)
+ {
+ assert(_db);
+ checkNotStale();
+ _client->_context = this;
+ _client->_curOp->enter( this );
+ checkNsAccess( doauth, d.dbMutex.getState() );
+ _client->checkLocks();
+ }
+
+ void Client::Context::_finishInit( bool doauth ) {
+ int lockState = d.dbMutex.getState();
+ assert( lockState );
+ if ( lockState > 0 && FileAllocator::get()->hasFailed() ) {
+ uassert(14031, "Can't take a write lock while out of disk space", false);
+ }
+
+ _db = dbHolderUnchecked().getOrCreate( _ns , _path , _justCreated );
+ assert(_db);
+ checkNotStale();
+ _client->_context = this;
+ _client->_curOp->enter( this );
+ checkNsAccess( doauth, lockState );
+ }
+
+ void Client::Context::_auth( int lockState ) {
+ if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) )
+ return;
+
+ // before we assert, do a little cleanup
+ _client->_context = _oldContext; // note: _oldContext may be null
+
+ stringstream ss;
+ ss << "unauthorized db:" << _db->name << " lock type:" << lockState << " client:" << _client->clientAddress();
+ uasserted( 10057 , ss.str() );
+ }
+
+ Client::Context::~Context() {
+ DEV assert( _client == currentClient.get() );
+ _client->_curOp->leave( this );
+ _client->_context = _oldContext; // note: _oldContext may be null
+ }
+
+ bool Client::Context::inDB( const string& db , const string& path ) const {
+ if ( _path != path )
+ return false;
+
+ if ( db == _ns )
+ return true;
+
+ string::size_type idx = _ns.find( db );
+ if ( idx != 0 )
+ return false;
+
+ return _ns[db.size()] == '.';
+ }
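+
+    // e.g. (illustrative): with _ns == "test.users", inDB( "test", _path )
+    // is true, while inDB( "tes", _path ) is false because the character
+    // after the prefix must be the '.' namespace separator.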
+
+ void Client::Context::checkNsAccess( bool doauth, int lockState ) {
+ if ( 0 ) { // SERVER-4276
+ uassert( 15929, "client access to index backing namespace prohibited", NamespaceString::normal( _ns.c_str() ) );
+ }
+ if ( doauth ) {
+ _auth( lockState );
+ }
+ }
+
+ void Client::appendLastOp( BSONObjBuilder& b ) const {
+ // _lastOp is never set if replication is off
+ if( theReplSet || ! _lastOp.isNull() ) {
+ b.appendTimestamp( "lastOp" , _lastOp.asDate() );
+ }
+ }
+
+ string Client::clientAddress(bool includePort) const {
+ if( _curOp )
+ return _curOp->getRemoteString(includePort);
+ return "";
+ }
+
+ string Client::toString() const {
+ stringstream ss;
+ if ( _curOp )
+ ss << _curOp->infoNoauth().jsonString();
+ return ss.str();
+ }
+
+ string sayClientState() {
+ Client* c = currentClient.get();
+ if ( !c )
+ return "no client";
+ return c->toString();
+ }
+
+ Client* curopWaitingForLock( int type ) {
+ Client * c = currentClient.get();
+ assert( c );
+ CurOp * co = c->curop();
+ if ( co ) {
+ co->waitingForLock( type );
+ }
+ return c;
+ }
+ void curopGotLock(Client *c) {
+ assert(c);
+ CurOp * co = c->curop();
+ if ( co )
+ co->gotLock();
+ }
+
+ void KillCurrentOp::interruptJs( AtomicUInt *op ) {
+ if ( !globalScriptEngine )
+ return;
+ if ( !op ) {
+ globalScriptEngine->interruptAll();
+ }
+ else {
+ globalScriptEngine->interrupt( *op );
+ }
+ }
+
+ void KillCurrentOp::killAll() {
+ _globalKill = true;
+ interruptJs( 0 );
+ }
+
+ void KillCurrentOp::kill(AtomicUInt i) {
+ bool found = false;
+ {
+ scoped_lock l( Client::clientsMutex );
+ for( set< Client* >::const_iterator j = Client::clients.begin(); !found && j != Client::clients.end(); ++j ) {
+ for( CurOp *k = ( *j )->curop(); !found && k; k = k->parent() ) {
+ if ( k->opNum() == i ) {
+ k->kill();
+ for( CurOp *l = ( *j )->curop(); l != k; l = l->parent() ) {
+ l->kill();
+ }
+ found = true;
+ }
+ }
+ }
+ }
+ if ( found ) {
+ interruptJs( &i );
+ }
+ }
+
+ void Client::gotHandshake( const BSONObj& o ) {
+ BSONObjIterator i(o);
+
+ {
+ BSONElement id = i.next();
+ assert( id.type() );
+ _remoteId = id.wrap( "_id" );
+ }
+
+ BSONObjBuilder b;
+ while ( i.more() )
+ b.append( i.next() );
+
+ b.appendElementsUnique( _handshake );
+
+ _handshake = b.obj();
+
+ if (theReplSet && o.hasField("member")) {
+ theReplSet->ghost->associateSlave(_remoteId, o["member"].Int());
+ }
+ }
+
+ ClientBasic* ClientBasic::getCurrent() {
+ return currentClient.get();
+ }
+
+ class HandshakeCmd : public Command {
+ public:
+ void help(stringstream& h) const { h << "internal"; }
+ HandshakeCmd() : Command( "handshake" ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ Client& c = cc();
+ c.gotHandshake( cmdObj );
+ return true;
+ }
+
+ } handshakeCmd;
+
+ class ClientListPlugin : public WebStatusPlugin {
+ public:
+ ClientListPlugin() : WebStatusPlugin( "clients" , 20 ) {}
+ virtual void init() {}
+
+ virtual void run( stringstream& ss ) {
+ using namespace mongoutils::html;
+
+ ss << "\n<table border=1 cellpadding=2 cellspacing=0>";
+ ss << "<tr align='left'>"
+ << th( a("", "Connections to the database, both internal and external.", "Client") )
+ << th( a("http://www.mongodb.org/display/DOCS/Viewing+and+Terminating+Current+Operation", "", "OpId") )
+ << "<th>Active</th>"
+ << "<th>LockType</th>"
+ << "<th>Waiting</th>"
+ << "<th>SecsRunning</th>"
+ << "<th>Op</th>"
+ << th( a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "", "Namespace") )
+ << "<th>Query</th>"
+ << "<th>client</th>"
+ << "<th>msg</th>"
+ << "<th>progress</th>"
+
+ << "</tr>\n";
+ {
+ scoped_lock bl(Client::clientsMutex);
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ CurOp& co = *(c->curop());
+ ss << "<tr><td>" << c->desc() << "</td>";
+
+ tablecell( ss , co.opNum() );
+ tablecell( ss , co.active() );
+ {
+ int lt = co.getLockType();
+ if( lt == -1 ) tablecell(ss, "R");
+ else if( lt == 1 ) tablecell(ss, "W");
+ else
+ tablecell( ss , lt);
+ }
+ tablecell( ss , co.isWaitingForLock() );
+ if ( co.active() )
+ tablecell( ss , co.elapsedSeconds() );
+ else
+ tablecell( ss , "" );
+ tablecell( ss , co.getOp() );
+ tablecell( ss , co.getNS() );
+ if ( co.haveQuery() ) {
+ tablecell( ss , co.query() );
+ }
+ else
+ tablecell( ss , "" );
+ tablecell( ss , co.getRemoteString() );
+
+ tablecell( ss , co.getMessage() );
+ tablecell( ss , co.getProgressMeter().toString() );
+
+
+ ss << "</tr>\n";
+ }
+ }
+ ss << "</table>\n";
+
+ }
+
+ } clientListPlugin;
+
+ int Client::recommendedYieldMicros( int * writers , int * readers ) {
+ int num = 0;
+ int w = 0;
+ int r = 0;
+ {
+ scoped_lock bl(clientsMutex);
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
+ Client* c = *i;
+ if ( c->curop()->isWaitingForLock() ) {
+ num++;
+ if ( c->curop()->getLockType() > 0 )
+ w++;
+ else
+ r++;
+ }
+ }
+ }
+
+ if ( writers )
+ *writers = w;
+ if ( readers )
+ *readers = r;
+
+ int time = r * 100;
+ time += w * 500;
+
+ time = min( time , 1000000 );
+
+ // if there has been a kill request for this op, we should yield to allow the op to stop.
+ // checkForInterruptNoAssert() returns an empty string if we aren't interrupted.
+ if ( *killCurrentOp.checkForInterruptNoAssert() ) {
+ return 100;
+ }
+
+ return time;
+ }
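+
+ // Worked example (illustrative): with 3 waiting readers and 1 waiting writer,
+ // time = 3*100 + 1*500 = 800 micros; the value is capped at 1000000 (one
+ // second), and a pending kill request shortens the suggestion to 100.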
+
+ int Client::getActiveClientCount( int& writers, int& readers ) {
+ writers = 0;
+ readers = 0;
+
+ scoped_lock bl(clientsMutex);
+ for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) {
+ Client* c = *i;
+ if ( ! c->curop()->active() )
+ continue;
+
+ int l = c->curop()->getLockType();
+ if ( l > 0 )
+ writers++;
+ else if ( l < 0 )
+ readers++;
+
+ }
+
+ return writers + readers;
+ }
+
+ void OpDebug::reset() {
+ extra.reset();
+
+ op = 0;
+ iscommand = false;
+ ns = "";
+ query = BSONObj();
+ updateobj = BSONObj();
+
+ cursorid = -1;
+ ntoreturn = -1;
+ ntoskip = -1;
+ exhaust = false;
+
+ nscanned = -1;
+ idhack = false;
+ scanAndOrder = false;
+ moved = false;
+ fastmod = false;
+ fastmodinsert = false;
+ upsert = false;
+ keyUpdates = 0; // unsigned, so -1 not possible
+
+ exceptionInfo.reset();
+
+ executionTime = 0;
+ nreturned = -1;
+ responseLength = -1;
+ }
+
+
+#define OPDEBUG_TOSTRING_HELP(x) if( x >= 0 ) s << " " #x ":" << (x)
+#define OPDEBUG_TOSTRING_HELP_BOOL(x) if( x ) s << " " #x ":" << (x)
+ string OpDebug::toString() const {
+ StringBuilder s( ns.size() + 64 );
+ if ( iscommand )
+ s << "command ";
+ else
+ s << opToString( op ) << ' ';
+ s << ns.toString();
+
+ if ( ! query.isEmpty() ) {
+ if ( iscommand )
+ s << " command: ";
+ else
+ s << " query: ";
+ s << query.toString();
+ }
+
+ if ( ! updateobj.isEmpty() ) {
+ s << " update: ";
+ updateobj.toString( s );
+ }
+
+ OPDEBUG_TOSTRING_HELP( cursorid );
+ OPDEBUG_TOSTRING_HELP( ntoreturn );
+ OPDEBUG_TOSTRING_HELP( ntoskip );
+ OPDEBUG_TOSTRING_HELP_BOOL( exhaust );
+
+ OPDEBUG_TOSTRING_HELP( nscanned );
+ OPDEBUG_TOSTRING_HELP_BOOL( idhack );
+ OPDEBUG_TOSTRING_HELP_BOOL( scanAndOrder );
+ OPDEBUG_TOSTRING_HELP_BOOL( moved );
+ OPDEBUG_TOSTRING_HELP_BOOL( fastmod );
+ OPDEBUG_TOSTRING_HELP_BOOL( fastmodinsert );
+ OPDEBUG_TOSTRING_HELP_BOOL( upsert );
+ OPDEBUG_TOSTRING_HELP( keyUpdates );
+
+ if ( extra.len() )
+ s << " " << extra.str();
+
+ if ( ! exceptionInfo.empty() ) {
+ s << " exception: " << exceptionInfo.msg;
+ if ( exceptionInfo.code )
+ s << " code:" << exceptionInfo.code;
+ }
+
+ OPDEBUG_TOSTRING_HELP( nreturned );
+ if ( responseLength )
+ s << " reslen:" << responseLength;
+ s << " " << executionTime << "ms";
+
+ return s.str();
+ }
+
+#define OPDEBUG_APPEND_NUMBER(x) if( x != -1 ) b.append( #x , (x) )
+#define OPDEBUG_APPEND_BOOL(x) if( x ) b.appendBool( #x , (x) )
+ void OpDebug::append( const CurOp& curop, BSONObjBuilder& b ) const {
+ b.append( "op" , iscommand ? "command" : opToString( op ) );
+ b.append( "ns" , ns.toString() );
+ if ( ! query.isEmpty() )
+ b.append( iscommand ? "command" : "query" , query );
+ else if ( ! iscommand && curop.haveQuery() )
+ curop.appendQuery( b , "query" );
+
+ if ( ! updateobj.isEmpty() )
+ b.append( "updateobj" , updateobj );
+
+ OPDEBUG_APPEND_NUMBER( cursorid );
+ OPDEBUG_APPEND_NUMBER( ntoreturn );
+ OPDEBUG_APPEND_NUMBER( ntoskip );
+ OPDEBUG_APPEND_BOOL( exhaust );
+
+ OPDEBUG_APPEND_NUMBER( nscanned );
+ OPDEBUG_APPEND_BOOL( idhack );
+ OPDEBUG_APPEND_BOOL( scanAndOrder );
+ OPDEBUG_APPEND_BOOL( moved );
+ OPDEBUG_APPEND_BOOL( fastmod );
+ OPDEBUG_APPEND_BOOL( fastmodinsert );
+ OPDEBUG_APPEND_BOOL( upsert );
+ OPDEBUG_APPEND_NUMBER( keyUpdates );
+
+ if ( ! exceptionInfo.empty() )
+ exceptionInfo.append( b , "exception" , "exceptionCode" );
+
+ OPDEBUG_APPEND_NUMBER( nreturned );
+ OPDEBUG_APPEND_NUMBER( responseLength );
+ b.append( "millis" , executionTime );
+
+ }
+
+}
diff --git a/src/mongo/db/client.h b/src/mongo/db/client.h
new file mode 100644
index 00000000000..6aa8bc00f02
--- /dev/null
+++ b/src/mongo/db/client.h
@@ -0,0 +1,286 @@
+/* @file db/client.h
+
+ "Client" represents a connection to the database (the server-side) and corresponds
+ to an open socket (or logical connection if pooling on sockets) from a client.
+
+ todo: switch to asio...this will fit nicely with that.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "security.h"
+#include "namespace-inl.h"
+#include "lasterror.h"
+#include "stats/top.h"
+#include "../db/client_common.h"
+#include "../util/concurrency/threadlocal.h"
+#include "../util/net/message_port.h"
+#include "../util/concurrency/rwlock.h"
+#include "d_concurrency.h"
+
+namespace mongo {
+
+ extern class ReplSet *theReplSet;
+ class AuthenticationInfo;
+ class Database;
+ class CurOp;
+ class Command;
+ class Client;
+ class AbstractMessagingPort;
+ class LockCollectionForReading;
+ class PageFaultRetryableSection;
+
+#if defined(CLC)
+ typedef LockCollectionForReading _LockCollectionForReading;
+#else
+ typedef readlock _LockCollectionForReading;
+#endif
+
+ TSP_DECLARE(Client, currentClient)
+
+ typedef long long ConnectionId;
+
+ /** the database's concept of an outside "client" */
+ class Client : public ClientBasic {
+ static Client *syncThread;
+ public:
+ // always hold clientsMutex when manipulating this. killop stuff uses these.
+ static set<Client*> clients;
+ static mongo::mutex clientsMutex;
+ static int getActiveClientCount( int& writers , int& readers );
+ class Context;
+ ~Client();
+ static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 );
+
+ /** each thread which does db operations has a Client object in TLS.
+ * call this when your thread starts.
+ */
+ static Client& initThread(const char *desc, AbstractMessagingPort *mp = 0);
+
+ static void initThreadIfNotAlready(const char *desc) {
+ if( currentClient.get() )
+ return;
+ initThread(desc);
+ }
+
+ /** this has to be called as the client goes away, but before thread termination
+ * @return true if anything was done
+ */
+ bool shutdown();
+
+ /** set so isSyncThread() works */
+ void iAmSyncThread() {
+ wassert( syncThread == 0 );
+ syncThread = this;
+ }
+ /** @return true if this client is the replication secondary pull thread. not used much, is used in create index sync code. */
+ bool isSyncThread() const { return this == syncThread; }
+
+ string clientAddress(bool includePort=false) const;
+ const AuthenticationInfo * getAuthenticationInfo() const { return &_ai; }
+ AuthenticationInfo * getAuthenticationInfo() { return &_ai; }
+ bool isAdmin() { return _ai.isAuthorized( "admin" ); }
+ CurOp* curop() const { return _curOp; }
+ Context* getContext() const { return _context; }
+ Database* database() const { return _context ? _context->db() : 0; }
+ const char *ns() const { return _context->ns(); }
+ const char *desc() const { return _desc; }
+ void setLastOp( OpTime op ) { _lastOp = op; }
+ OpTime getLastOp() const { return _lastOp; }
+
+ /** caution -- use Context class instead */
+ void setContext(Context *c) { _context = c; }
+
+ /* report what the last operation was. used by getlasterror */
+ void appendLastOp( BSONObjBuilder& b ) const;
+
+ bool isGod() const { return _god; } /* this is for map/reduce writes */
+ string toString() const;
+ void gotHandshake( const BSONObj& o );
+ bool hasRemote() const { return _mp; }
+ HostAndPort getRemote() const { assert( _mp ); return _mp->remote(); }
+ BSONObj getRemoteID() const { return _remoteId; }
+ BSONObj getHandshake() const { return _handshake; }
+ AbstractMessagingPort * port() const { return _mp; }
+ ConnectionId getConnectionId() const { return _connectionId; }
+ private:
+ Client(const char *desc, AbstractMessagingPort *p = 0);
+ friend class CurOp;
+ ConnectionId _connectionId; // > 0 for "conn" (client connection) threads, 0 otherwise
+ string _threadId; // "" on systems without thread id support
+ CurOp * _curOp;
+ Context * _context;
+ bool _shutdown; // to track if Client::shutdown() gets called
+ const char * const _desc;
+ bool _god;
+ AuthenticationInfo _ai;
+ OpTime _lastOp;
+ BSONObj _handshake;
+ BSONObj _remoteId;
+ AbstractMessagingPort * const _mp;
+ unsigned _sometimes;
+ public:
+ bool _hasWrittenThisPass;
+ PageFaultRetryableSection *_pageFaultRetryableSection;
+
+ /** the concept here is the same as MONGO_SOMETIMES. however, that
+ macro uses a static counter shared by all threads, and each
+ increment might eject that cache line from the other cpus' caches,
+ so the idea is that this per-client counter is better.
+ */
+ bool sometimes(unsigned howOften) { return ++_sometimes % howOften == 0; }
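+ // e.g. (hypothetical): if( cc().sometimes(128) ) log() << "occasional note" << endl;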
+
+ /* set _god=true temporarily, safely */
+ class GodScope {
+ bool _prev;
+ public:
+ GodScope();
+ ~GodScope();
+ };
+
+ //static void assureDatabaseIsOpen(const string& ns, string path=dbpath);
+
+ /** "read lock, and set my context, all in one operation"
+ * This handles (if not recursively locked) opening an unopened database.
+ */
+ class ReadContext : boost::noncopyable {
+ public:
+ ReadContext(const string& ns, string path=dbpath, bool doauth=true );
+ Context& ctx() { return *c.get(); }
+ private:
+ scoped_ptr<_LockCollectionForReading> lk;
+ scoped_ptr<Context> c;
+ };
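+
+ // Usage sketch (hypothetical namespace):
+ // Client::ReadContext rc("test.foo"); // read-locks and sets the context
+ // Database* db = rc.ctx().db();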
+
+ /* Set the database we want to use, then restore the previous one when we finish (go out of scope).
+ Note this is also helpful if an exception happens, as the state is fixed up.
+ */
+ class Context : boost::noncopyable {
+ public:
+ /** this is probably what you want */
+ Context(const string& ns, string path=dbpath, bool doauth=true );
+
+ /** note: this does not call finishInit -- i.e., does not call
+ shardVersionOk() for example.
+ see also: reset().
+ */
+ Context( string ns , Database * db, bool doauth=true );
+
+ // used by ReadContext
+ Context(const string& path, const string& ns, Database *db, bool doauth);
+
+ ~Context();
+ Client* getClient() const { return _client; }
+ Database* db() const { return _db; }
+ const char * ns() const { return _ns.c_str(); }
+ bool equals( const string& ns , const string& path=dbpath ) const { return _ns == ns && _path == path; }
+
+ /** @return if the db was created by this Context */
+ bool justCreated() const { return _justCreated; }
+
+ /** @return true iff the current Context is using db/path */
+ bool inDB( const string& db , const string& path=dbpath ) const;
+
+ void _clear() { // this is sort of an "early destruct" indication; _ns can never be restored once cleared
+ const_cast<string&>(_ns).clear(); // clear(), not empty(): empty() only tests and did nothing here
+ _db = 0;
+ }
+
+ /** call before unlocking, to clear any non-thread-safe state
+ * _db gets restored on the relock
+ */
+ void unlocked() { _db = 0; }
+
+ /** call after going back into the lock, will re-establish non-thread safe stuff */
+ void relocked() { _finishInit(); }
+
+ private:
+ friend class CurOp;
+ void _finishInit( bool doauth=true);
+ void _auth( int lockState );
+ void checkNotStale() const;
+ void checkNsAccess( bool doauth, int lockState = d.dbMutex.getState() );
+ Client * const _client;
+ Context * const _oldContext;
+ const string _path;
+ bool _justCreated;
+ const string _ns;
+ Database * _db;
+ }; // class Client::Context
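+
+ // Usage sketch, mirroring what cloner.cpp does below:
+ // writelock lk(ns); // take the lock first
+ // Client::Context ctx(ns); // then set (and auth-check) the context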
+
+ struct LockStatus {
+ LockStatus();
+ string whichCollection;
+ unsigned excluder, global, collection;
+ string toString() const;
+ } lockStatus;
+
+#if defined(CLC)
+ void checkLocks() const;
+#else
+ void checkLocks() const { }
+#endif
+
+ }; // class Client
+
+ /** get the Client object for this thread. */
+ inline Client& cc() {
+ Client * c = currentClient.get();
+ assert( c );
+ return *c;
+ }
+
+ inline Client::GodScope::GodScope() {
+ _prev = cc()._god;
+ cc()._god = true;
+ }
+ inline Client::GodScope::~GodScope() { cc()._god = _prev; }
+
+ /* this unreadlocks and then writelocks; i.e. it does NOT upgrade inside the
+ lock (and is thus wrong to use if you need that, which you usually do).
+ that said we use it today for a specific case where the usage is correct.
+ */
+#if 0
+ inline void mongolock::releaseAndWriteLock() {
+ if( !_writelock ) {
+
+#if BOOST_VERSION >= 103500
+ int s = d.dbMutex.getState();
+ if( s != -1 ) {
+ log() << "error: releaseAndWriteLock() s == " << s << endl;
+ msgasserted( 12600, "releaseAndWriteLock: unlock_shared failed, probably recursive" );
+ }
+#endif
+
+ _writelock = true;
+ d.dbMutex.unlock_shared();
+ d.dbMutex.lock();
+
+ // todo: unlocked() method says to call it before unlocking, not after. so fix this here,
+ // or fix the doc there.
+ if ( cc().getContext() )
+ cc().getContext()->unlocked();
+ }
+ }
+#endif
+
+ inline bool haveClient() { return currentClient.get() != 0; }
+
+};
diff --git a/src/mongo/db/client_common.h b/src/mongo/db/client_common.h
new file mode 100644
index 00000000000..eb70105ef99
--- /dev/null
+++ b/src/mongo/db/client_common.h
@@ -0,0 +1,47 @@
+// client_common.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+//#include "../pch.h"
+//#include "security.h"
+#include "../util/net/hostandport.h"
+
+namespace mongo {
+
+ class AuthenticationInfo;
+
+ /**
+ * this is the base class for Client and ClientInfo
+ * Client is for mongod
+ * ClientInfo is for mongos
+ * They should converge slowly
+ * The idea is this has the basic api so that not all code has to be duplicated
+ */
+ class ClientBasic : boost::noncopyable {
+ public:
+ virtual ~ClientBasic(){}
+ virtual const AuthenticationInfo * getAuthenticationInfo() const = 0;
+ virtual AuthenticationInfo * getAuthenticationInfo() = 0;
+
+ virtual bool hasRemote() const = 0;
+ virtual HostAndPort getRemote() const = 0;
+
+ static ClientBasic* getCurrent();
+ };
+}
diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp
new file mode 100644
index 00000000000..dc04ec38f63
--- /dev/null
+++ b/src/mongo/db/clientcursor.cpp
@@ -0,0 +1,747 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* clientcursor.cpp
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+
+ Cursor -- and its derived classes -- are our internal cursors.
+*/
+
+#include "pch.h"
+#include "clientcursor.h"
+#include "introspect.h"
+#include <time.h>
+#include "db.h"
+#include "commands.h"
+#include "repl_block.h"
+#include "../util/processinfo.h"
+#include "../util/timer.h"
+#include "../server.h"
+
+namespace mongo {
+
+ CCById ClientCursor::clientCursorsById;
+ boost::recursive_mutex& ClientCursor::ccmutex( *(new boost::recursive_mutex()) );
+ long long ClientCursor::numberTimedOut = 0;
+
+ void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); // from s/d_logic.h
+
+ /*static*/ void ClientCursor::assertNoCursors() {
+ recursive_scoped_lock lock(ccmutex);
+ if( clientCursorsById.size() ) {
+ log() << "ERROR clientcursors exist but should not at this point" << endl;
+ ClientCursor *cc = clientCursorsById.begin()->second;
+ log() << "first one: " << cc->_cursorid << ' ' << cc->_ns << endl;
+ clientCursorsById.clear();
+ assert(false);
+ }
+ }
+
+
+ void ClientCursor::setLastLoc_inlock(DiskLoc L) {
+ assert( _pos != -2 ); // defensive - see ~ClientCursor
+
+ if ( L == _lastLoc )
+ return;
+
+ CCByLoc& bl = byLoc();
+
+ if ( !_lastLoc.isNull() ) {
+ bl.erase( ByLocKey( _lastLoc, _cursorid ) );
+ }
+
+ if ( !L.isNull() )
+ bl[ByLocKey(L,_cursorid)] = this;
+ _lastLoc = L;
+ }
+
+ /* ------------------------------------------- */
+
+ /* must call this when a btree node is updated */
+ //void removedKey(const DiskLoc& btreeLoc, int keyPos) {
+ //}
+
+ // ns is either a full namespace or "dbname." when invalidating for a whole db
+ void ClientCursor::invalidate(const char *ns) {
+ d.dbMutex.assertWriteLocked();
+ int len = strlen(ns);
+ const char* dot = strchr(ns, '.');
+ assert( len > 0 && dot);
+
+ bool isDB = (dot == &ns[len-1]); // first (and only) dot is the last char
+
+ {
+ //cout << "\nTEMP invalidate " << ns << endl;
+ recursive_scoped_lock lock(ccmutex);
+
+ Database *db = cc().database();
+ assert(db);
+ assert( str::startsWith(ns, db->name) );
+
+ for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); /*++i*/ ) {
+ ClientCursor *cc = i->second;
+
+ ++i; // we may be removing this node
+
+ if( cc->_db != db )
+ continue;
+
+ if (isDB) {
+ // already checked that db matched above
+ dassert( str::startsWith(cc->_ns.c_str(), ns) );
+ delete cc; //removes self from ccByID
+ }
+ else {
+ if ( str::equals(cc->_ns.c_str(), ns) )
+ delete cc; //removes self from ccByID
+ }
+ }
+
+ /*
+ note : we can't iterate byloc because clientcursors may exist with a loc of null in which case
+ they are not in the map. perhaps they should not exist though in the future? something to
+ change???
+
+ CCByLoc& bl = db->ccByLoc;
+ for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); ++i ) {
+ ClientCursor *cc = i->second;
+ if ( strncmp(ns, cc->ns.c_str(), len) == 0 ) {
+ assert( cc->_db == db );
+ toDelete.push_back(i->second);
+ }
+ }*/
+
+ /*cout << "TEMP after invalidate " << endl;
+ for( auto i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) {
+ cout << " " << i->second->ns << endl;
+ }
+ cout << "TEMP after invalidate done" << endl;*/
+ }
+ }
+
+ /* note called outside of locks (other than ccmutex) so care must be exercised */
+ bool ClientCursor::shouldTimeout( unsigned millis ) {
+ _idleAgeMillis += millis;
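+ // 600000 millis == 10 minutes of idle time; a nonzero _pinValue means the
+ // cursor is pinned (in use) or marked no-timeout, so it is never timed out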
+ return _idleAgeMillis > 600000 && _pinValue == 0;
+ }
+
+ /* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */
+ void ClientCursor::idleTimeReport(unsigned millis) {
+ bool foundSomeToTimeout = false;
+
+ // two passes so that we don't need to readlock unless we really do some timeouts
+ // we assume here that incrementing _idleAgeMillis outside readlock is ok.
+ {
+ recursive_scoped_lock lock(ccmutex);
+ {
+ unsigned sz = clientCursorsById.size();
+ static time_t last;
+ if( sz >= 100000 ) {
+ if( time(0) - last > 300 ) {
+ last = time(0);
+ log() << "warning number of open cursors is very large: " << sz << endl;
+ }
+ }
+ }
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) {
+ CCById::iterator j = i;
+ i++;
+ if( j->second->shouldTimeout( millis ) ) {
+ foundSomeToTimeout = true;
+ break;
+ }
+ }
+ }
+
+ if( foundSomeToTimeout ) {
+ // todo: ideally all readlocks automatically note what we are locking for so this
+ // can be reported in currentop command. e.g. something like:
+ // readlock lk("", "timeout cursors");
+ readlock lk("");
+ recursive_scoped_lock lock(ccmutex);
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) {
+ CCById::iterator j = i;
+ i++;
+ if( j->second->shouldTimeout(0) ) {
+ numberTimedOut++;
+ LOG(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns
+ << " idle:" << j->second->idleTime() << "ms\n";
+ delete j->second;
+ }
+ }
+ }
+ }
+
+ /* must call when a btree bucket is going away.
+ note this is potentially slow
+ */
+ void ClientCursor::informAboutToDeleteBucket(const DiskLoc& b) {
+ recursive_scoped_lock lock(ccmutex);
+ Database *db = cc().database();
+ CCByLoc& bl = db->ccByLoc;
+ RARELY if ( bl.size() > 70 ) {
+ log() << "perf warning: byLoc.size=" << bl.size() << " in aboutToDeleteBucket\n";
+ }
+ if( bl.size() == 0 ) {
+ DEV tlog() << "debug warning: no cursors found in informAboutToDeleteBucket()" << endl;
+ }
+ for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); i++ )
+ i->second->_c->aboutToDeleteBucket(b);
+ }
+ void aboutToDeleteBucket(const DiskLoc& b) {
+ ClientCursor::informAboutToDeleteBucket(b);
+ }
+
+ /* must call this on a delete so we clean up the cursors. */
+ void ClientCursor::aboutToDelete(const DiskLoc& dl) {
+ recursive_scoped_lock lock(ccmutex);
+
+ Database *db = cc().database();
+ assert(db);
+
+ aboutToDeleteForSharding( db , dl );
+
+ CCByLoc& bl = db->ccByLoc;
+ CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl));
+ CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl));
+ if ( j == stop )
+ return;
+
+ vector<ClientCursor*> toAdvance;
+
+ while ( 1 ) {
+ toAdvance.push_back(j->second);
+ DEV assert( j->first.loc == dl );
+ ++j;
+ if ( j == stop )
+ break;
+ }
+
+ if( toAdvance.size() >= 3000 ) {
+ log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc "
+ << dl.toString()
+ << ' ' << toAdvance[1000]->_ns
+ << ' ' << toAdvance[2000]->_ns
+ << ' ' << toAdvance[1000]->_pinValue
+ << ' ' << toAdvance[2000]->_pinValue
+ << ' ' << toAdvance[1000]->_pos
+ << ' ' << toAdvance[2000]->_pos
+ << ' ' << toAdvance[1000]->_idleAgeMillis
+ << ' ' << toAdvance[2000]->_idleAgeMillis
+ << ' ' << toAdvance[1000]->_doingDeletes
+ << ' ' << toAdvance[2000]->_doingDeletes
+ << endl;
+ //wassert( toAdvance.size() < 5000 );
+ }
+
+ for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) {
+ ClientCursor* cc = *i;
+ wassert(cc->_db == db);
+
+ if ( cc->_doingDeletes ) continue;
+
+ Cursor *c = cc->_c.get();
+ if ( c->capped() ) {
+ /* note we cannot advance here. if this condition occurs, writes to the oplog
+ have "caught" the reader. skipping ahead, the reader would miss postentially
+ important data.
+ */
+ delete cc;
+ continue;
+ }
+
+ c->checkLocation();
+ DiskLoc tmp1 = c->refLoc();
+ if ( tmp1 != dl ) {
+ // This might indicate a failure to call ClientCursor::updateLocation() but it can
+ // also happen during correct operation, see SERVER-2009.
+ problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
+ }
+ else {
+ c->advance();
+ }
+ while (!c->eof() && c->refLoc() == dl) {
+ /* We don't delete at EOF because we want to return "no more results" rather than "no such cursor".
+ * The loop is to handle MultiKey indexes where the deleted record is pointed to by multiple adjacent keys.
+ * In that case we need to advance until we get to the next distinct record or EOF.
+ * SERVER-4154
+ */
+ c->advance();
+ }
+ cc->updateLocation();
+ }
+ }
+ void aboutToDelete(const DiskLoc& dl) { ClientCursor::aboutToDelete(dl); }
+
+ ClientCursor::ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query ) :
+ _ns(ns), _db( cc().database() ),
+ _c(c), _pos(0),
+ _query(query), _queryOptions(queryOptions),
+ _idleAgeMillis(0), _pinValue(0),
+ _doingDeletes(false), _yieldSometimesTracker(128,10) {
+
+ d.dbMutex.assertAtLeastReadLocked();
+
+ assert( _db );
+ assert( str::startsWith(_ns, _db->name) );
+ if( queryOptions & QueryOption_NoCursorTimeout )
+ noTimeout();
+ recursive_scoped_lock lock(ccmutex);
+ _cursorid = allocCursorId_inlock();
+ clientCursorsById.insert( make_pair(_cursorid, this) );
+
+ if ( ! _c->modifiedKeys() ) {
+ // store index information so we can decide if we can
+ // get something out of the index key rather than full object
+
+ int x = 0;
+ BSONObjIterator i( _c->indexKeyPattern() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.isNumber() ) {
+ // only want basic index fields, not "2d" etc
+ _indexedFields[e.fieldName()] = x;
+ }
+ x++;
+ }
+ }
+
+ }
+
+
+ ClientCursor::~ClientCursor() {
+ if( _pos == -2 ) {
+ // defensive: destructor called twice
+ wassert(false);
+ return;
+ }
+
+ {
+ recursive_scoped_lock lock(ccmutex);
+ setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap
+ clientCursorsById.erase(_cursorid);
+
+ // defensive:
+ (CursorId&)_cursorid = -1;
+ _pos = -2;
+ }
+ }
+
+ bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ current().getFieldsDotted( name , ret );
+ return false;
+ }
+
+ int x = i->second;
+
+ holder = currKey();
+ BSONObjIterator it( holder );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
+ }
+ assert( x == 0 );
+ ret.insert( it.next() );
+ return true;
+ }
+
+ BSONElement ClientCursor::getFieldDotted( const string& name , BSONObj& holder , bool * fromKey ) {
+
+ map<string,int>::const_iterator i = _indexedFields.find( name );
+ if ( i == _indexedFields.end() ) {
+ if ( fromKey )
+ *fromKey = false;
+ holder = current();
+ return holder.getFieldDotted( name );
+ }
+
+ int x = i->second;
+
+ holder = currKey();
+ BSONObjIterator it( holder );
+ while ( x && it.more() ) {
+ it.next();
+ x--;
+ }
+ assert( x == 0 );
+
+ if ( fromKey )
+ *fromKey = true;
+ return it.next();
+ }
+
+ BSONObj ClientCursor::extractFields(const BSONObj &pattern , bool fillWithNull ) {
+ BSONObjBuilder b( pattern.objsize() * 2 );
+
+ BSONObj holder;
+
+ BSONObjIterator i( pattern );
+ while ( i.more() ) {
+ BSONElement key = i.next();
+ BSONElement value = getFieldDotted( key.fieldName() , holder );
+
+ if ( value.type() ) {
+ b.appendAs( value , key.fieldName() );
+ continue;
+ }
+
+ if ( fillWithNull )
+ b.appendNull( key.fieldName() );
+
+ }
+
+ return b.obj();
+ }
+
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void ClientCursor::updateLocation() {
+ assert( _cursorid );
+ _idleAgeMillis = 0;
+ DiskLoc cl = _c->refLoc();
+ if ( lastLoc() == cl ) {
+ //log() << "info: lastloc==curloc " << ns << '\n';
+ }
+ else {
+ recursive_scoped_lock lock(ccmutex);
+ setLastLoc_inlock(cl);
+ }
+ // may be necessary for MultiCursor even when cl hasn't changed
+ _c->noteLocation();
+ }
+
+ int ClientCursor::suggestYieldMicros() {
+ int writers = 0;
+ int readers = 0;
+
+ int micros = Client::recommendedYieldMicros( &writers , &readers );
+
+ if ( micros > 0 && writers == 0 && d.dbMutex.getState() <= 0 ) {
+ // we have a read lock, and only reads are coming in, so why bother unlocking
+ return 0;
+ }
+
+ wassert( micros < 10000000 );
+ dassert( micros < 1000001 );
+ return micros;
+ }
+
+ Record* ClientCursor::_recordForYield( ClientCursor::RecordNeeds need ) {
+ if ( need == DontNeed ) {
+ return 0;
+ }
+ else if ( need == MaybeCovered ) {
+ // TODO
+ return 0;
+ }
+ else if ( need == WillNeed ) {
+ // no-op
+ }
+ else {
+ warning() << "don't understand RecordNeeds: " << (int)need << endl;
+ return 0;
+ }
+
+ DiskLoc l = currLoc();
+ if ( l.isNull() )
+ return 0;
+
+ Record * rec = l.rec();
+ if ( rec->likelyInPhysicalMemory() )
+ return 0;
+
+ return rec;
+ }
+
+ bool ClientCursor::yieldSometimes( RecordNeeds need, bool *yielded ) {
+ if ( yielded ) {
+ *yielded = false;
+ }
+ if ( ! _yieldSometimesTracker.intervalHasElapsed() ) {
+ Record* rec = _recordForYield( need );
+ if ( rec ) {
+ // yield for page fault
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( suggestYieldMicros() , rec );
+ }
+ return true;
+ }
+
+ int micros = suggestYieldMicros();
+ if ( micros > 0 ) {
+ if ( yielded ) {
+ *yielded = true;
+ }
+ return yield( micros , _recordForYield( need ) );
+ }
+ return true;
+ }
+
+ void ClientCursor::staticYield( int micros , const StringData& ns , Record * rec ) {
+ killCurrentOp.checkForInterrupt( false );
+ {
+ auto_ptr<LockMongoFilesShared> lk;
+ if ( rec ) {
+ // need to lock this, else rec->touch won't be safe: the file could disappear
+ lk.reset( new LockMongoFilesShared() );
+ }
+
+ dbtempreleasecond unlock;
+ if ( unlock.unlocked() ) {
+ if ( micros == -1 )
+ micros = Client::recommendedYieldMicros();
+ if ( micros > 0 )
+ sleepmicros( micros );
+ }
+ else {
+ CurOp * c = cc().curop();
+ while ( c->parent() )
+ c = c->parent();
+ LOGSOME << "warning ClientCursor::yield can't unlock b/c of recursive lock"
+ << " ns: " << ns
+ << " top: " << c->info()
+ << endl;
+ }
+
+ if ( rec )
+ rec->touch();
+
+ lk.reset(0); // need to release this before dbtempreleasecond
+ }
+ }
+
+ bool ClientCursor::prepareToYield( YieldData &data ) {
+ if ( ! _c->supportYields() )
+ return false;
+ if ( ! _c->prepareToYield() ) {
+ return false;
+ }
+ // need to store in case 'this' gets deleted
+ data._id = _cursorid;
+
+ data._doingDeletes = _doingDeletes;
+ _doingDeletes = false;
+
+ updateLocation();
+
+ {
+ /* a quick test that our temprelease is safe.
+ todo: make a YieldingCursor class
+ and then make the following code part of a unit test.
+ */
+ const int test = 0;
+ static bool inEmpty = false;
+ if( test && !inEmpty ) {
+ inEmpty = true;
+ log() << "TEST: manipulate collection during cc:yield" << endl;
+ if( test == 1 )
+ Helpers::emptyCollection(_ns.c_str());
+ else if( test == 2 ) {
+ BSONObjBuilder b; string m;
+ dropCollection(_ns.c_str(), m, b);
+ }
+ else {
+ dropDatabase(_ns.c_str());
+ }
+ }
+ }
+ return true;
+ }
+
+ bool ClientCursor::recoverFromYield( const YieldData &data ) {
+ ClientCursor *cc = ClientCursor::find( data._id , false );
+ if ( cc == 0 ) {
+ // id was deleted
+ return false;
+ }
+
+ cc->_doingDeletes = data._doingDeletes;
+ cc->_c->recoverFromYield();
+ return true;
+ }
+
+ /** @return true if cursor is still ok */
+ bool ClientCursor::yield( int micros , Record * recordToLoad ) {
+
+ if ( ! _c->supportYields() ) // some cursors (geo@oct2011) don't support yielding
+ return true;
+
+ YieldData data;
+ prepareToYield( data );
+ staticYield( micros , _ns , recordToLoad );
+ return ClientCursor::recoverFromYield( data );
+ }
+
+ long long ctmLast = 0; // so we don't have to do find(), which is a little slow, very often.
+ long long ClientCursor::allocCursorId_inlock() {
+ long long ctm = curTimeMillis64();
+ dassert( ctm );
+ long long x;
+ while ( 1 ) {
+ x = (((long long)rand()) << 32);
+ x = x ^ ctm;
+ if ( ctm != ctmLast || ClientCursor::find_inlock(x, false) == 0 )
+ break;
+ }
+ ctmLast = ctm;
+ return x;
+ }
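+
+ // Illustrative: an id is (rand() << 32) ^ currentMillis; uniqueness within
+ // the same millisecond is guaranteed by the find_inlock() probe above.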
+
+ void ClientCursor::storeOpForSlave( DiskLoc last ) {
+ if ( ! ( _queryOptions & QueryOption_OplogReplay ))
+ return;
+
+ if ( last.isNull() )
+ return;
+
+ BSONElement e = last.obj()["ts"];
+ if ( e.type() == Date || e.type() == Timestamp )
+ _slaveReadTill = e._opTime();
+ }
+
+ void ClientCursor::updateSlaveLocation( CurOp& curop ) {
+ if ( _slaveReadTill.isNull() )
+ return;
+ mongo::updateSlaveLocation( curop , _ns.c_str() , _slaveReadTill );
+ }
+
+
+ void ClientCursor::appendStats( BSONObjBuilder& result ) {
+ recursive_scoped_lock lock(ccmutex);
+ result.appendNumber("totalOpen", clientCursorsById.size() );
+ result.appendNumber("clientCursors_size", (int) numCursors());
+ result.appendNumber("timedOut" , numberTimedOut);
+ unsigned pinned = 0;
+ unsigned notimeout = 0;
+ for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); i++ ) {
+ unsigned p = i->second->_pinValue;
+ if( p >= 100 )
+ pinned++;
+ else if( p > 0 )
+ notimeout++;
+ }
+ if( pinned )
+ result.append("pinned", pinned);
+ if( notimeout )
+ result.append("totalNoTimeout", notimeout);
+ }
+
+ // QUESTION: Restrict to the namespace from which this command was issued?
+ // Alternatively, make this command admin-only?
+ class CmdCursorInfo : public Command {
+ public:
+ CmdCursorInfo() : Command( "cursorInfo", true ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << " example: { cursorInfo : 1 }";
+ }
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ ClientCursor::appendStats( result );
+ return true;
+ }
+ } cmdCursorInfo;
+
+ struct Mem {
+ Mem() { res = virt = mapped = 0; }
+ int res;
+ int virt;
+ int mapped;
+ bool grew(const Mem& r) {
+ return (r.res && (((double)res)/r.res)>1.1 ) ||
+ (r.virt && (((double)virt)/r.virt)>1.1 ) ||
+ (r.mapped && (((double)mapped)/r.mapped)>1.1 );
+ }
+ };
+
+ /** called once a minute from killcursors thread */
+ void sayMemoryStatus() {
+ static time_t last;
+ static Mem mlast;
+ try {
+ ProcessInfo p;
+ if ( !cmdLine.quiet && p.supported() ) {
+ Mem m;
+ m.res = p.getResidentSize();
+ m.virt = p.getVirtualMemorySize();
+ m.mapped = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ if( time(0)-last >= 300 || m.grew(mlast) ) {
+ log() << "mem (MB) res:" << m.res << " virt:" << m.virt << " mapped:" << m.mapped << endl;
+ if( m.virt - (cmdLine.dur?2:1)*m.mapped > 5000 ) {
+ ONCE log() << "warning virtual/mapped memory differential is large. journaling:" << cmdLine.dur << endl;
+ }
+ last = time(0);
+ mlast = m;
+ }
+ }
+ }
+ catch(...) {
+ log() << "ProcessInfo exception" << endl;
+ }
+ }
+
+ /** thread for timing out old cursors */
+ void ClientCursorMonitor::run() {
+ Client::initThread("clientcursormon");
+ Client& client = cc();
+ Timer t;
+ const int Secs = 4;
+ unsigned n = 0;
+ while ( ! inShutdown() ) {
+ ClientCursor::idleTimeReport( t.millisReset() );
+ sleepsecs(Secs);
+ if( ++n % (60/4) == 0 /*once a minute*/ ) {
+ sayMemoryStatus();
+ }
+ }
+ client.shutdown();
+ }
+
+ void ClientCursor::find( const string& ns , set<CursorId>& all ) {
+ recursive_scoped_lock lock(ccmutex);
+
+ for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ) {
+ if ( i->second->_ns == ns )
+ all.insert( i->first );
+ }
+ }
+
+ int ClientCursor::erase(int n, long long *ids) {
+ int found = 0;
+ for ( int i = 0; i < n; i++ ) {
+ if ( erase(ids[i]) )
+ found++;
+
+ if ( inShutdown() )
+ break;
+ }
+ return found;
+
+ }
+
+ ClientCursorMonitor clientCursorMonitor;
+
+} // namespace mongo
diff --git a/src/mongo/db/clientcursor.h b/src/mongo/db/clientcursor.h
new file mode 100644
index 00000000000..e570820f62c
--- /dev/null
+++ b/src/mongo/db/clientcursor.h
@@ -0,0 +1,430 @@
+/* clientcursor.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Cursor -- and its derived classes -- are our internal cursors.
+
+ ClientCursor is a wrapper that represents a cursorid from our database
+ application's perspective.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "cursor.h"
+#include "jsobj.h"
+#include "../util/net/message.h"
+#include "../util/net/listen.h"
+#include "../util/background.h"
+#include "diskloc.h"
+#include "dbhelpers.h"
+#include "matcher.h"
+#include "../client/dbclient.h"
+#include "projection.h"
+#include "s/d_chunk_manager.h"
+
+namespace mongo {
+
+ typedef long long CursorId; /* passed to the client so it can send back on getMore */
+ class Cursor; /* internal server cursor base class */
+ class ClientCursor;
+ class ParsedQuery;
+
+ struct ByLocKey {
+
+ ByLocKey( const DiskLoc & l , const CursorId& i ) : loc(l), id(i) {}
+
+ static ByLocKey min( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::min() ); }
+ static ByLocKey max( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::max() ); }
+
+ bool operator<( const ByLocKey &other ) const {
+ int x = loc.compare( other.loc );
+ if ( x )
+ return x < 0;
+ return id < other.id;
+ }
+
+ DiskLoc loc;
+ CursorId id;
+
+ };
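+
+ // min()/max() bracket every cursor id for a given DiskLoc, so a CCByLoc map
+ // can be range-scanned for one location, e.g. (see aboutToDelete in the cpp):
+ // bl.lower_bound( ByLocKey::min(dl) ) .. bl.upper_bound( ByLocKey::max(dl) )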
+
+ /* todo: make this map per connection. this will perhaps prevent cursor hijacking security attacks.
+ * ERH: 9/2010 this may not work since some drivers send getMore over a different connection
+ */
+ typedef map<CursorId, ClientCursor*> CCById;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
+
+ extern BSONObj id_obj;
+
+ class ClientCursor {
+ friend class CmdCursorInfo;
+ public:
+ static void assertNoCursors();
+
+ /* use this to assure we don't time out a cursor in the background while it is in use.
+ if you are using noTimeout() already, there is no risk anyway.
+ Further, this mechanism guards against two getMore requests on the same cursor executing
+ at the same time - which might be bad. That should never happen, but if a client driver
+ had a bug, it could (or perhaps some sort of attack situation).
+ */
+ class Pointer : boost::noncopyable {
+ ClientCursor *_c;
+ public:
+ ClientCursor * c() { return _c; }
+ void release() {
+ if( _c ) {
+ assert( _c->_pinValue >= 100 );
+ _c->_pinValue -= 100;
+ _c = 0;
+ }
+ }
+ /**
+ * call this if during a yield, the cursor got deleted
+ * if so, we don't want to use the stale pointer address
+ */
+ void deleted() {
+ _c = 0;
+ }
+ ~Pointer() { release(); }
+ Pointer(long long cursorid) {
+ recursive_scoped_lock lock(ccmutex);
+ _c = ClientCursor::find_inlock(cursorid, true);
+ if( _c ) {
+ if( _c->_pinValue >= 100 ) {
+ _c = 0;
+ uasserted(12051, "clientcursor already in use? driver problem?");
+ }
+ _c->_pinValue += 100;
+ }
+ }
+ };
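+
+ // Usage sketch (assumed getMore-style use):
+ // ClientCursor::Pointer p( cursorid ); // pins the cursor (+100)
+ // if ( ClientCursor *cc = p.c() ) { /* safe to use: can't be timed out */ }
+ // // pin released when p goes out of scope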
+
+ // This object assures safe and reliable cleanup of the ClientCursor.
+ // The implementation assumes that there will be no duplicate ids among cursors
+ // (which is assured if cursors must last longer than 1 second).
+ class CleanupPointer : boost::noncopyable {
+ public:
+ CleanupPointer() : _c( 0 ), _id( -1 ) {}
+ void reset( ClientCursor *c = 0 ) {
+ if ( c == _c )
+ return;
+ if ( _c ) {
+ // be careful in case cursor was deleted by someone else
+ ClientCursor::erase( _id );
+ }
+ if ( c ) {
+ _c = c;
+ _id = c->_cursorid;
+ }
+ else {
+ _c = 0;
+ _id = -1;
+ }
+ }
+ ~CleanupPointer() {
+ DESTRUCTOR_GUARD ( reset(); );
+ }
+ operator bool() { return _c; }
+ ClientCursor * operator-> () { return _c; }
+ private:
+ ClientCursor *_c;
+ CursorId _id;
+ };
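+
+ // Usage sketch (illustrative): the held cursor is erased when the pointer
+ // is reset or destroyed, guarding against leaks if an exception unwinds:
+ // ClientCursor::CleanupPointer ccPointer;
+ // ccPointer.reset( someClientCursor ); // someClientCursor is hypothetical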
+
+ ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query = BSONObj() );
+
+ ~ClientCursor();
+
+ // *************** basic accessors *******************
+
+ CursorId cursorid() const { return _cursorid; }
+ string ns() const { return _ns; }
+ Database * db() const { return _db; }
+ const BSONObj& query() const { return _query; }
+ int queryOptions() const { return _queryOptions; }
+
+ DiskLoc lastLoc() const { return _lastLoc; }
+
+ /* Get rid of cursors for namespaces 'ns'. When dropping a db, ns is "dbname."
+ Used by drop, dropIndexes, dropDatabase.
+ */
+ static void invalidate(const char *ns);
+
+ /**
+ * @param microsToSleep -1 : ask client
+ * >=0 : sleep for that amount
+ * @param recordToLoad after yielding lock, load this record with only mmutex
+ * do a dbtemprelease
+ * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic -
+ * we don't do it here, as this->matcher (above) is only initialized for true queries/getmore.
+ * (ie not set for remote/update)
+ * @return if the cursor is still valid.
+ * if false is returned, then this ClientCursor should be considered deleted -
+ * in fact, the whole database could be gone.
+ */
+ bool yield( int microsToSleep = -1 , Record * recordToLoad = 0 );
+
+ enum RecordNeeds {
+ DontNeed = -1 , MaybeCovered = 0 , WillNeed = 100
+ };
+
+ /**
+ * @param need whether or not the next record has to be read from disk for sure;
+ * if this is true, will yield if the next record isn't in memory
+ * @param yielded true if a yield occurred, and potentially if a yield did not occur
+ * @return same as yield()
+ */
+ bool yieldSometimes( RecordNeeds need, bool *yielded = 0 );
+
+ static int suggestYieldMicros();
+ static void staticYield( int micros , const StringData& ns , Record * rec );
+
+ struct YieldData { CursorId _id; bool _doingDeletes; };
+ bool prepareToYield( YieldData &data );
+ static bool recoverFromYield( const YieldData &data );
+
+ struct YieldLock : boost::noncopyable {
+ explicit YieldLock( ptr<ClientCursor> cc )
+ : _canYield(cc->_c->supportYields()) {
+ if ( _canYield ) {
+ cc->prepareToYield( _data );
+ _unlock.reset(new dbtempreleasecond());
+ }
+ }
+ ~YieldLock() {
+ if ( _unlock ) {
+ log( LL_WARNING ) << "ClientCursor::YieldLock not closed properly" << endl;
+ relock();
+ }
+ }
+ bool stillOk() {
+ if ( ! _canYield )
+ return true;
+ relock();
+ return ClientCursor::recoverFromYield( _data );
+ }
+ void relock() {
+ _unlock.reset();
+ }
+ private:
+ const bool _canYield;
+ YieldData _data;
+ scoped_ptr<dbtempreleasecond> _unlock;
+ };
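+
+ // Usage sketch (assumed pattern):
+ // ClientCursor::YieldLock yl( cc );
+ // /* ... slow work while the lock is released ... */
+ // if ( ! yl.stillOk() ) { /* cursor was deleted during the yield */ }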
+
+ // --- some pass through helpers for Cursor ---
+
+ Cursor* c() const { return _c.get(); }
+ int pos() const { return _pos; }
+
+ void incPos( int n ) { _pos += n; } // TODO: this is bad
+ void setPos( int n ) { _pos = n; } // TODO : this is bad too
+
+ BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
+ bool modifiedKeys() const { return _c->modifiedKeys(); }
+ bool isMultiKey() const { return _c->isMultiKey(); }
+
+ bool ok() { return _c->ok(); }
+ bool advance() { return _c->advance(); }
+ BSONObj current() { return _c->current(); }
+ DiskLoc currLoc() { return _c->currLoc(); }
+ BSONObj currKey() const { return _c->currKey(); }
+
+ /**
+ * same as BSONObj::getFieldsDotted
+ * if it can be retrieved from key, it is
+ * @param holder keeps the currKey in scope by keeping a reference to it here. generally you'll want
+ * holder and ret to destruct about the same time.
+ * @return if this was retrieved from key
+ */
+ bool getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder );
+
+ /**
+ * same as BSONObj::getFieldDotted
+ * if it can be retrieved from key, it is
+ * @return if this was retrieved from key
+ */
+ BSONElement getFieldDotted( const string& name , BSONObj& holder , bool * fromKey = 0 ) ;
+
+ /** extract items from object which match a pattern object.
+ * e.g., if pattern is { x : 1, y : 1 }, builds an object with
+ * x and y elements of this object, if they are present.
+ * returns elements with original field names
+ * NOTE: copied from BSONObj::extractFields
+ */
+ BSONObj extractFields(const BSONObj &pattern , bool fillWithNull = false) ;
+
+ bool currentIsDup() { return _c->getsetdup( _c->currLoc() ); }
+
+ bool currentMatches() {
+ if ( ! _c->matcher() )
+ return true;
+ return _c->matcher()->matchesCurrent( _c.get() );
+ }
+
+ void setChunkManager( ShardChunkManagerPtr manager ){ _chunkManager = manager; }
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
+ private:
+ void setLastLoc_inlock(DiskLoc);
+
+ static ClientCursor* find_inlock(CursorId id, bool warn = true) {
+ CCById::iterator it = clientCursorsById.find(id);
+ if ( it == clientCursorsById.end() ) {
+ if ( warn )
+ OCCASIONALLY out() << "ClientCursor::find(): cursor not found in map " << id << " (ok after a drop)\n";
+ return 0;
+ }
+ return it->second;
+ }
+ public:
+ static ClientCursor* find(CursorId id, bool warn = true) {
+ recursive_scoped_lock lock(ccmutex);
+ ClientCursor *c = find_inlock(id, warn);
+ // if this asserts, your code was not thread safe - you either need to set no timeout
+ // for the cursor or keep a ClientCursor::Pointer in scope for it.
+ massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue );
+ return c;
+ }
+
+ static bool erase(CursorId id) {
+ recursive_scoped_lock lock(ccmutex);
+ ClientCursor *cc = find_inlock(id);
+ if ( cc ) {
+ assert( cc->_pinValue < 100 ); // you can't still have an active ClientCursor::Pointer
+ delete cc;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * @return number of cursors found
+ */
+ static int erase( int n , long long * ids );
+
+ /* call when cursor's location changes so that we can update the
+ cursorsbylocation map. if you are locked and internally iterating, only
+ need to call when you are ready to "unlock".
+ */
+ void updateLocation();
+
+ void mayUpgradeStorage() {
+ /* if ( !ids_.get() )
+ return;
+ stringstream ss;
+ ss << ns << "." << cursorid;
+ ids_->mayUpgradeStorage( ss.str() );*/
+ }
+
+ /**
+ * @param millis amount of idle passed time since last call
+ */
+ bool shouldTimeout( unsigned millis );
+
+ void storeOpForSlave( DiskLoc last );
+ void updateSlaveLocation( CurOp& curop );
+
+ unsigned idleTime() const { return _idleAgeMillis; }
+
+ void setDoingDeletes( bool doingDeletes ) {_doingDeletes = doingDeletes; }
+
+ void slaveReadTill( const OpTime& t ) { _slaveReadTill = t; }
+
+ public: // static methods
+
+ static void idleTimeReport(unsigned millis);
+
+ static void appendStats( BSONObjBuilder& result );
+ static unsigned numCursors() { return clientCursorsById.size(); }
+ static void informAboutToDeleteBucket(const DiskLoc& b);
+ static void aboutToDelete(const DiskLoc& dl);
+ static void find( const string& ns , set<CursorId>& all );
+
+
+ private: // methods
+
+ // cursors normally time out after an inactivity period to prevent excess memory use
+ // setting this prevents timeout of the cursor in question.
+ void noTimeout() { _pinValue++; }
+
+ CCByLoc& byLoc() { return _db->ccByLoc; }
+
+ Record* _recordForYield( RecordNeeds need );
+
+ private:
+
+ CursorId _cursorid;
+
+ const string _ns;
+ Database * _db;
+
+ const shared_ptr<Cursor> _c;
+ map<string,int> _indexedFields; // map from indexed field to offset in key object
+ int _pos; // # objects into the cursor so far
+
+ const BSONObj _query; // used for logging diags only; optional in constructor
+ int _queryOptions; // see enum QueryOptions dbclient.h
+
+ OpTime _slaveReadTill;
+
+ DiskLoc _lastLoc; // use getter and setter not this (important)
+ unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time
+
+ /* 0 = normal
+ 1 = no timeout allowed
+ 100 = in use (pinned) -- see Pointer class
+ */
+ unsigned _pinValue;
+
+ bool _doingDeletes; // when true we are doing the delete and aboutToDelete shouldn't manipulate us
+ ElapsedTracker _yieldSometimesTracker;
+
+ ShardChunkManagerPtr _chunkManager;
+
+ public:
+ shared_ptr<ParsedQuery> pq;
+ shared_ptr<Projection> fields; // which fields query wants returned
+ Message originalMessage; // this is effectively an auto ptr for data the matcher points to
+
+
+
+ private: // static members
+
+ static CCById clientCursorsById;
+ static long long numberTimedOut;
+ static boost::recursive_mutex& ccmutex; // must use this for all statics above!
+ static CursorId allocCursorId_inlock();
+
+ };
+
+ class ClientCursorMonitor : public BackgroundJob {
+ public:
+ string name() const { return "ClientCursorMonitor"; }
+ void run();
+ };
+
+} // namespace mongo
+
+// ClientCursor should only be used with auto_ptr, because it needs to be
+// release()ed after a yield if stillOk() returns false, and these pointer types
+// do not support releasing. The empty specializations below prevent them from
+// being used accidentally.
+namespace boost{
+ template<> class scoped_ptr<mongo::ClientCursor> {};
+ template<> class shared_ptr<mongo::ClientCursor> {};
+}
diff --git a/src/mongo/db/cloner.cpp b/src/mongo/db/cloner.cpp
new file mode 100644
index 00000000000..e35ae95052d
--- /dev/null
+++ b/src/mongo/db/cloner.cpp
@@ -0,0 +1,763 @@
+// cloner.cpp - copy a database (export/import basically)
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "cloner.h"
+#include "pdfile.h"
+#include "../client/dbclient.h"
+#include "../bson/util/builder.h"
+#include "jsobj.h"
+#include "ops/query.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "repl.h"
+
+namespace mongo {
+
+ BSONElement getErrField(const BSONObj& o);
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool replAuthenticate(DBClientBase *);
+
+ /** Selectively release the mutex based on a parameter. */
+ class dbtempreleaseif {
+ public:
+ dbtempreleaseif( bool release ) : _impl( release ? new dbtemprelease() : 0 ) {}
+ private:
+ shared_ptr< dbtemprelease > _impl;
+ };
+
+ void mayInterrupt( bool mayBeInterrupted ) {
+ if ( mayBeInterrupted ) {
+ killCurrentOp.checkForInterrupt( false );
+ }
+ }
+
+ class Cloner: boost::noncopyable {
+ auto_ptr< DBClientWithCommands > conn;
+ void copy(const char *from_ns, const char *to_ns, bool isindex, bool logForRepl,
+ bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query q = Query());
+ struct Fun;
+ public:
+ Cloner() { }
+
+ /* slaveOk - if true it is ok if the source of the data is !ismaster.
+ useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ for example repairDatabase need not use it.
+ */
+ void setConnection( DBClientWithCommands *c ) { conn.reset( c ); }
+
+ /** copy the entire database */
+ bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollection( const string& ns , const BSONObj& query , string& errmsg , bool mayYield, bool mayBeInterrupted, bool copyIndexes = true, bool logForRepl = true );
+ };
+
+ /* for index info object:
+ { "name" : "name_1" , "ns" : "foo.index3" , "key" : { "name" : 1.0 } }
+ we need to fix up the value in the "ns" parameter so that the name prefix is correct on a
+ copy to a new name.
+ */
+ BSONObj fixindex(BSONObj o) {
+ BSONObjBuilder b;
+ BSONObjIterator i(o);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+
+ // for now, skip the "v" field so that v:0 indexes will be upgraded to v:1
+ if ( string("v") == e.fieldName() ) {
+ continue;
+ }
+
+ if ( string("ns") == e.fieldName() ) {
+ uassert( 10024 , "bad ns field for index during dbcopy", e.type() == String);
+ const char *p = strchr(e.valuestr(), '.');
+ uassert( 10025 , "bad ns field for index during dbcopy [2]", p);
+ string newname = cc().database()->name + p;
+ b.append("ns", newname);
+ }
+ else
+ b.append(e);
+ }
+ BSONObj res= b.obj();
+
+ /* if( mod ) {
+ out() << "before: " << o.toString() << endl;
+ o.dump();
+ out() << "after: " << res.toString() << endl;
+ res.dump();
+ }*/
+
+ return res;
+ }
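+
+ // Example of the transform (using the index object from the comment above):
+ // cloning into a database named "bar",
+ // { "name":"name_1", "ns":"foo.index3", "key":{ "name":1 }, "v":0 }
+ // becomes
+ // { "name":"name_1", "ns":"bar.index3", "key":{ "name":1 } } // "v" dropped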
+
+ struct Cloner::Fun {
+ Fun() : lastLog(0) { }
+ time_t lastLog;
+ void operator()( DBClientCursorBatchIterator &i ) {
+ mongolock l( true );
+ if ( context ) {
+ context->relocked();
+ }
+
+ while( i.moreInCurrentBatch() ) {
+ if ( n % 128 == 127 /*yield some*/ ) {
+ time_t now = time(0);
+ if( now - lastLog >= 60 ) {
+ // report progress
+ if( lastLog )
+ log() << "clone " << to_collection << ' ' << n << endl;
+ lastLog = now;
+ }
+ mayInterrupt( _mayBeInterrupted );
+ dbtempreleaseif t( _mayYield );
+ }
+
+ BSONObj tmp = i.nextSafe();
+
+ /* assure object is valid. note this will slow us down a little. */
+ if ( !tmp.valid() ) {
+ stringstream ss;
+ ss << "Cloner: skipping corrupt object from " << from_collection;
+ BSONElement e = tmp.firstElement();
+ try {
+ e.validate();
+ ss << " firstElement: " << e;
+ }
+ catch( ... ) {
+ ss << " firstElement corrupt";
+ }
+ out() << ss.str() << endl;
+ continue;
+ }
+
+ ++n;
+
+ BSONObj js = tmp;
+ if ( isindex ) {
+ assert( strstr(from_collection, "system.indexes") );
+ js = fixindex(tmp);
+ storedForLater->push_back( js.getOwned() );
+ continue;
+ }
+
+ try {
+ theDataFileMgr.insertWithObjMod(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+
+ RARELY if ( time( 0 ) - saveLast > 60 ) {
+ log() << n << " objects cloned so far from collection " << from_collection << endl;
+ saveLast = time( 0 );
+ }
+ }
+ }
+ int n;
+ bool isindex;
+ const char *from_collection;
+ const char *to_collection;
+ time_t saveLast;
+ list<BSONObj> *storedForLater;
+ bool logForRepl;
+ Client::Context *context;
+ bool _mayYield;
+ bool _mayBeInterrupted;
+ };
+
+ /* copy the specified collection
+ isindex - if true, this is system.indexes collection, in which we do some transformation when copying.
+ */
+ void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query query) {
+ list<BSONObj> storedForLater;
+
+ Fun f;
+ f.n = 0;
+ f.isindex = isindex;
+ f.from_collection = from_collection;
+ f.to_collection = to_collection;
+ f.saveLast = time( 0 );
+ f.storedForLater = &storedForLater;
+ f.logForRepl = logForRepl;
+ f._mayYield = mayYield;
+ f._mayBeInterrupted = mayBeInterrupted;
+
+ int options = QueryOption_NoCursorTimeout | ( slaveOk ? QueryOption_SlaveOk : 0 );
+ {
+ f.context = cc().getContext();
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+ DBClientConnection *remote = dynamic_cast< DBClientConnection* >( conn.get() );
+ if ( remote ) {
+ remote->query( boost::function<void(DBClientCursorBatchIterator &)>( f ), from_collection, query, 0, options );
+ }
+ else {
+ // there is no exhaust mode for direct client, so we have this hack
+ auto_ptr<DBClientCursor> c = conn->query( from_collection, query, 0, 0, 0, options );
+ assert( c.get() );
+ while( c->more() ) {
+ DBClientCursorBatchIterator i( *c );
+ f( i );
+ }
+ }
+ }
+
+ if ( storedForLater.size() ) {
+ for ( list<BSONObj>::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ) {
+ BSONObj js = *i;
+ try {
+ theDataFileMgr.insertWithObjMod(to_collection, js);
+ if ( logForRepl )
+ logOp("i", to_collection, js);
+
+ getDur().commitIfNeeded();
+ }
+ catch( UserException& e ) {
+ log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
+ }
+ }
+ }
+ }
+
+ bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg) {
+ Cloner c;
+
+ DBClientConnection *conn = new DBClientConnection();
+ // cloner owns conn in auto_ptr
+ c.setConnection(conn);
+ uassert(15908, errmsg, conn->connect(host, errmsg) && replAuthenticate(conn));
+
+ return c.copyCollection(ns, BSONObj(), errmsg, true, false, /*copyIndexes*/ true, false);
+ }
+
+ bool Cloner::copyCollection( const string& ns, const BSONObj& query, string& errmsg,
+ bool mayYield, bool mayBeInterrupted, bool copyIndexes, bool logForRepl ) {
+
+ writelock lk(ns); // TODO: acquire this lock lower down in the call stack
+ Client::Context ctx(ns);
+
+ {
+ // config
+ string temp = ctx.db()->name + ".system.namespaces";
+ BSONObj config = conn->findOne( temp , BSON( "name" << ns ) );
+ if ( config["options"].isABSONObj() )
+ if ( ! userCreateNS( ns.c_str() , config["options"].Obj() , errmsg, logForRepl , 0 ) )
+ return false;
+ }
+
+ {
+ // main data
+ copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , mayYield, mayBeInterrupted, Query(query).snapshot() );
+ }
+
+ /* TODO : copyIndexes bool does not seem to be implemented! */
+ if( !copyIndexes ) {
+ log() << "ERROR copy collection copyIndexes not implemented? " << ns << endl;
+ }
+
+ {
+ // indexes
+ string temp = ctx.db()->name + ".system.indexes";
+ copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , mayYield, mayBeInterrupted, BSON( "ns" << ns ) );
+ }
+ getDur().commitIfNeeded();
+ return true;
+ }
+
+ extern bool inDBRepair;
+ void ensureIdIndexForNewNs(const char *ns);
+
+ bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
+ if ( errCode ) {
+ *errCode = 0;
+ }
+ massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl );
+
+ string todb = cc().database()->name;
+ stringstream a,b;
+ a << "localhost:" << cmdLine.port;
+ b << "127.0.0.1:" << cmdLine.port;
+ bool masterSameProcess = ( a.str() == masterHost || b.str() == masterHost );
+ if ( masterSameProcess ) {
+ if ( fromdb == todb && cc().database()->path == dbpath ) {
+ // guard against an "infinite" loop
+ /* if you are replicating, the local.sources config may be wrong if you get this */
+ errmsg = "can't clone from self (localhost).";
+ return false;
+ }
+ }
+ /* todo: we can put these releases inside dbclient or a dbclient specialization.
+ or just wait until we get rid of global lock anyway.
+ */
+ string ns = fromdb + ".system.namespaces";
+ list<BSONObj> toClone;
+ {
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+
+ // just using exhaust for collection copying right now
+ auto_ptr<DBClientCursor> c;
+ {
+ if ( conn.get() ) {
+ // nothing to do
+ }
+ else if ( !masterSameProcess ) {
+ ConnectionString cs = ConnectionString::parse( masterHost, errmsg );
+ auto_ptr<DBClientBase> con( cs.connect( errmsg ));
+ if ( !con.get() )
+ return false;
+ if( !replAuthenticate(con.get()) )
+ return false;
+
+ conn = con;
+ }
+ else {
+ conn.reset( new DBDirectClient() );
+ }
+ // todo: if snapshot (bool param to this func) is true, should we snapshot this query?
+ // it would only be relevant if there were thousands of collections -- and maybe
+ // even then it is hard to exceed a single cursor batch.
+ // for repl it is probably ok as we apply the oplog section after the clone (i.e.
+ // repl does not use snapshot=true).
+ c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? QueryOption_SlaveOk : 0 );
+ }
+
+ if ( c.get() == 0 ) {
+ errmsg = "query failed " + ns;
+ return false;
+ }
+
+ if ( c->more() ) {
+ BSONObj first = c->next();
+ if( !getErrField(first).eoo() ) {
+ if ( errCode ) {
+ *errCode = first.getIntField("code");
+ }
+ errmsg = "query failed " + ns;
+ return false;
+ }
+ c->putBack( first );
+ }
+
+ while ( c->more() ) {
+ BSONObj collection = c->next();
+
+ log(2) << "\t cloner got " << collection << endl;
+
+ BSONElement e = collection.getField("name");
+ if ( e.eoo() ) {
+ string s = "bad system.namespaces object " + collection.toString();
+ massert( 10290 , s.c_str(), false);
+ }
+ assert( !e.eoo() );
+ assert( e.type() == String );
+ const char *from_name = e.valuestr();
+
+ if( strstr(from_name, ".system.") ) {
+ /* system.users and system.js are cloned -- but nothing else from system.*;
+ system.indexes is handled specially at the end */
+ if( legalClientSystemNS( from_name , true ) == 0 ) {
+ log(2) << "\t\t not cloning because system collection" << endl;
+ continue;
+ }
+ }
+ if( ! NamespaceString::normal( from_name ) ) {
+ log(2) << "\t\t not cloning because has $ " << endl;
+ continue;
+ }
+ toClone.push_back( collection.getOwned() );
+ }
+ }
+
+ for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ) {
+ {
+ mayInterrupt( mayBeInterrupted );
+ dbtempreleaseif r( mayYield );
+ }
+ BSONObj collection = *i;
+ log(2) << " really will clone: " << collection << endl;
+ const char * from_name = collection["name"].valuestr();
+ BSONObj options = collection.getObjectField("options");
+
+ /* change name "<fromdb>.collection" -> "<todb>.collection" */
+ const char *p = strchr(from_name, '.');
+ assert(p);
+ string to_name = todb + p;
+
+ bool wantIdIndex = false;
+ {
+ string err;
+ const char *toname = to_name.c_str();
+ /* we defer building id index for performance - building it in batch is much faster */
+ userCreateNS(toname, options, err, logForRepl, &wantIdIndex);
+ }
+ log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl;
+ Query q;
+ if( snapshot )
+ q.snapshot();
+ copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, q);
+
+ if( wantIdIndex ) {
+ /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations
+ that occur during the initial sync. inDBRepair makes dropDups be true.
+ */
+ bool old = inDBRepair;
+ try {
+ inDBRepair = true;
+ ensureIdIndexForNewNs(to_name.c_str());
+ inDBRepair = old;
+ }
+ catch(...) {
+ inDBRepair = old;
+ throw;
+ }
+ }
+ }
+
+ // now build the indexes
+
+ string system_indexes_from = fromdb + ".system.indexes";
+ string system_indexes_to = todb + ".system.indexes";
+ /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix
+ rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this
+ is dubious here at the moment.
+ */
+ // won't need a snapshot of the query of system.indexes as there can never be very many.
+ copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, BSON( "name" << NE << "_id_" ) );
+
+ return true;
+ }
+
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) {
+ Cloner c;
+ return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot, mayYield, mayBeInterrupted, errCode);
+ }
+
+ /* Usage:
+ mydb.$cmd.findOne( { clone: "fromhost" } );
+ */
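+ /* shell equivalent (assuming the standard mongo shell helper):
+ db.cloneDatabase("fromhost")
+ */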
+ class CmdClone : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "clone this database from an instance of the db on another host\n";
+ help << "{ clone : \"host13\" }";
+ }
+ CmdClone() : Command("clone") { }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string from = cmdObj.getStringField("clone");
+ if ( from.empty() )
+ return false;
+ /* replication note: we must logOp() not the command, but the cloned data -- if the slave
+ were to clone it would get a different point-in-time and not match.
+ */
+ return cloneFrom(from.c_str(), errmsg, dbname,
+ /*logForReplication=*/!fromRepl, /*slaveOk*/false, /*usereplauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/false);
+ }
+ } cmdclone;
+
+ class CmdCloneCollection : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdCloneCollection() : Command("cloneCollection") { }
+ virtual void help( stringstream &help ) const {
+ help << "{ cloneCollection: <namespace>, from: <host> [,query: <query_filter>] [,copyIndexes:<bool>] }"
+ "\nCopies a collection from one server to another. Do not use on a single server as the destination "
+ "is placed at the same db.collection (namespace) as the source.\n"
+ "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there."
+ ;
+ }
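+ /* hypothetical invocation, run from the db the collection should land in:
+ db.runCommand( { cloneCollection: "test.users", from: "otherhost:27017", query: { active: true } } )
+ */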
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("from");
+ if ( fromhost.empty() ) {
+ errmsg = "missing 'from' parameter";
+ return false;
+ }
+ {
+ HostAndPort h(fromhost);
+ if( h.isSelf() ) {
+ errmsg = "can't cloneCollection from self";
+ return false;
+ }
+ }
+ string collection = cmdObj.getStringField("cloneCollection");
+ if ( collection.empty() ) {
+ errmsg = "bad 'cloneCollection' value";
+ return false;
+ }
+ BSONObj query = cmdObj.getObjectField("query");
+ if ( query.isEmpty() )
+ query = BSONObj();
+
+ BSONElement copyIndexesSpec = cmdObj.getField("copyindexes");
+ bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true;
+
+ log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost
+ << " query: " << query << " " << ( copyIndexes ? "" : ", not copying indexes" ) << endl;
+
+ Cloner c;
+ auto_ptr<DBClientConnection> myconn;
+ myconn.reset( new DBClientConnection() );
+ if ( ! myconn->connect( fromhost , errmsg ) )
+ return false;
+
+ c.setConnection( myconn.release() );
+
+ return c.copyCollection( collection , query, errmsg , true, false, copyIndexes );
+ }
+ } cmdclonecollection;
+
+
+ thread_specific_ptr< DBClientConnection > authConn_;
+ /* Usage:
+ admindb.$cmd.findOne( { copydbgetnonce: 1, fromhost: <hostname> } );
+ */
+ class CmdCopyDbGetNonce : public Command {
+ public:
+ CmdCopyDbGetNonce() : Command("copydbgetnonce") { }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "get a nonce for subsequent copy db request from secure server\n";
+ help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}";
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string fromhost = cmdObj.getStringField("fromhost");
+ if ( fromhost.empty() ) {
+ /* copy from self */
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ fromhost = ss.str();
+ }
+ authConn_.reset( new DBClientConnection() );
+ BSONObj ret;
+ {
+ dbtemprelease t;
+ if ( !authConn_->connect( fromhost, errmsg ) )
+ return false;
+ if( !authConn_->runCommand( "admin", BSON( "getnonce" << 1 ), ret ) ) {
+ errmsg = "couldn't get nonce " + ret.toString();
+ return false;
+ }
+ }
+ result.appendElements( ret );
+ return true;
+ }
+ } cmdcopydbgetnonce;
+
+ /* Usage:
+ admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>] } );
+ */
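+ /* auth handshake sketch (key derivation assumed to follow the MONGODB-CR scheme
+ of this era):
+ 1) { copydbgetnonce: 1, fromhost: <h> } fetches a nonce from the source server;
+ 2) key = md5( nonce + username + md5( username + ":mongo:" + password ) );
+ 3) { copydb: 1, ..., username: <u>, nonce: <n>, key: <key> } -- the connection
+ saved by step 1 (authConn_) is reused below to run "authenticate" against fromdb.
+ */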
+ class CmdCopyDb : public Command {
+ public:
+ CmdCopyDb() : Command("copydb") { }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "copy a database from another host to this host\n";
+ help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, slaveOk: <bool>, username: <username>, nonce: <nonce>, key: <key>]}";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool slaveOk = cmdObj["slaveOk"].trueValue();
+ string fromhost = cmdObj.getStringField("fromhost");
+ if ( fromhost.empty() ) {
+ /* copy from self */
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ fromhost = ss.str();
+ }
+ string fromdb = cmdObj.getStringField("fromdb");
+ string todb = cmdObj.getStringField("todb");
+ if ( fromhost.empty() || todb.empty() || fromdb.empty() ) {
+ errmsg = "parms missing - {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}";
+ return false;
+ }
+ Cloner c;
+ string username = cmdObj.getStringField( "username" );
+ string nonce = cmdObj.getStringField( "nonce" );
+ string key = cmdObj.getStringField( "key" );
+ if ( !username.empty() && !nonce.empty() && !key.empty() ) {
+ uassert( 13008, "must call copydbgetnonce first", authConn_.get() );
+ BSONObj ret;
+ {
+ dbtemprelease t;
+ if ( !authConn_->runCommand( fromdb, BSON( "authenticate" << 1 << "user" << username << "nonce" << nonce << "key" << key ), ret ) ) {
+ errmsg = "unable to login " + ret.toString();
+ return false;
+ }
+ }
+ c.setConnection( authConn_.release() );
+ }
+ Client::Context ctx(todb);
+ bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, slaveOk, /*replauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/ false);
+ return res;
+ }
+ } cmdcopydb;
+
+ class CmdRenameCollection : public Command {
+ public:
+ CmdRenameCollection() : Command( "renameCollection" ) {}
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool requiresAuth() { return false; } // do our own auth
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool logTheOp() {
+ return true; // can't log steps when doing fast rename within a db, so always log the op rather than individual steps comprising it.
+ }
+ virtual void help( stringstream &help ) const {
+ help << " example: { renameCollection: foo.a, to: bar.b }";
+ }
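+ // overview of the logic below: a rename within a single db takes the fast
+ // renameNamespace path; a cross-db rename falls back to copying the data, then
+ // the indexes (with each "ns" field rewritten to the target), and finally
+ // dropping the source collection.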
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string source = cmdObj.getStringField( name.c_str() );
+ string target = cmdObj.getStringField( "to" );
+ uassert(15967,"invalid collection name: " + target, NamespaceString::validCollectionName(target.c_str()));
+ if ( source.empty() || target.empty() ) {
+ errmsg = "invalid command syntax";
+ return false;
+ }
+
+ bool capped = false;
+ long long size = 0;
+ {
+ Client::Context ctx( source ); // auths against source
+ NamespaceDetails *nsd = nsdetails( source.c_str() );
+ uassert( 10026 , "source namespace does not exist", nsd );
+ capped = nsd->capped;
+ if ( capped )
+ for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext )
+ size += i.ext()->length;
+ }
+
+ Client::Context ctx( target ); //auths against target
+
+ if ( nsdetails( target.c_str() ) ) {
+ uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
+ BSONObjBuilder bb( result.subobjStart( "dropTarget" ) );
+ dropCollection( target , errmsg , bb );
+ bb.done();
+ if ( errmsg.size() > 0 )
+ return false;
+ }
+
+ {
+ char from[256];
+ nsToDatabase( source.c_str(), from );
+ char to[256];
+ nsToDatabase( target.c_str(), to );
+ if ( strcmp( from, to ) == 0 ) {
+ renameNamespace( source.c_str(), target.c_str() );
+ // make sure we drop counters etc
+ Top::global.collectionDropped( source );
+ return true;
+ }
+ }
+
+ BSONObjBuilder spec;
+ if ( capped ) {
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ }
+ if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c;
+ DBDirectClient bridge;
+
+ c = bridge.query( source, BSONObj() );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ theDataFileMgr.insertWithObjMod( target.c_str(), o );
+ }
+
+ char cl[256];
+ nsToDatabase( source.c_str(), cl );
+ string sourceIndexes = string( cl ) + ".system.indexes";
+ nsToDatabase( target.c_str(), cl );
+ string targetIndexes = string( cl ) + ".system.indexes";
+ c = bridge.query( sourceIndexes, QUERY( "ns" << source ) );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ BSONObjBuilder b;
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
+ b.append( "ns", target );
+ }
+ else {
+ b.append( e );
+ }
+ }
+ BSONObj n = b.done();
+ theDataFileMgr.insertWithObjMod( targetIndexes.c_str(), n );
+ }
+
+ {
+ Client::Context ctx( source );
+ dropCollection( source, errmsg, result );
+ }
+ return true;
+ }
+ } cmdrenamecollection;
+
+} // namespace mongo
diff --git a/src/mongo/db/cloner.h b/src/mongo/db/cloner.h
new file mode 100644
index 00000000000..130fea0fac1
--- /dev/null
+++ b/src/mongo/db/cloner.h
@@ -0,0 +1,39 @@
+// cloner.h - copy a database (export/import basically)
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * @param slaveOk - if true it is ok if the source of the data is !ismaster.
+ * @param useReplAuth - use the credentials we normally use as a replication slave for the cloning
+ * @param snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower.
+ * for example repairDatabase need not use it.
+ * @param errCode - If provided, this will be set on error to the server's error code. Currently
+ * this will only be set if there is an error in the initial system.namespaces query.
+ */
+ bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication,
+ bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield,
+ bool mayBeInterrupted, int *errCode = 0);
+
+ bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg);
+
+} // namespace mongo
diff --git a/src/mongo/db/cmdline.cpp b/src/mongo/db/cmdline.cpp
new file mode 100644
index 00000000000..a9b0d7097ca
--- /dev/null
+++ b/src/mongo/db/cmdline.cpp
@@ -0,0 +1,519 @@
+// cmdline.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "commands.h"
+#include "../util/password.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "security_common.h"
+#ifdef _WIN32
+#include <direct.h>
+#else
+#include <sys/types.h>
+#include <sys/wait.h>
+#endif
+#include "globals.h"
+
+#define MAX_LINE_LENGTH 256
+
+namespace po = boost::program_options;
+namespace fs = boost::filesystem;
+
+namespace mongo {
+
+ void setupSignals( bool inFork );
+ string getHostNameCached();
+ static BSONArray argvArray;
+ static BSONObj parsedOpts;
+
+ void CmdLine::addGlobalOptions( boost::program_options::options_description& general ,
+ boost::program_options::options_description& hidden ) {
+ /* support for -vv -vvvv etc. */
+ for (string s = "vv"; s.length() <= 12; s.append("v")) {
+ hidden.add_options()(s.c_str(), "verbose");
+ }
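+ // e.g. "-vvv" matches the hidden option "vvv"; store() below maps it to
+ // logLevel = 3 (the count of v's).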
+
+ general.add_options()
+ ("help,h", "show this usage information")
+ ("version", "show version information")
+ ("config,f", po::value<string>(), "configuration file specifying additional options")
+ ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)")
+ ("quiet", "quieter output")
+ ("port", po::value<int>(&cmdLine.port), "specify port number")
+ ("bind_ip", po::value<string>(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default")
+ ("maxConns",po::value<int>(), "max number of simultaneous connections")
+ ("objcheck", "inspect client data for validity on receipt")
+ ("logpath", po::value<string>() , "log file to send write to instead of stdout - has to be a file, not directory" )
+ ("logappend" , "append to logpath instead of over-writing" )
+ ("pidfilepath", po::value<string>(), "full path to pidfile (if not set, no pidfile is created)")
+ ("keyFile", po::value<string>(), "private key for cluster authentication (only for replica sets)")
+#ifndef _WIN32
+ ("nounixsocket", "disable listening on unix sockets")
+ ("unixSocketPrefix", po::value<string>(), "alternative directory for UNIX domain sockets (defaults to /tmp)")
+ ("fork" , "fork server process" )
+ ("syslog" , "log to system's syslog facility instead of file or stdout" )
+#endif
+ ;
+
+ hidden.add_options()
+ ("cloud", po::value<string>(), "custom dynamic host naming")
+#ifdef MONGO_SSL
+ ("sslOnNormalPorts" , "use ssl on configured ports" )
+ ("sslPEMKeyFile" , po::value<string>(&cmdLine.sslPEMKeyFile), "PEM file for ssl" )
+ ("sslPEMKeyPassword" , new PasswordValue(&cmdLine.sslPEMKeyPassword) , "PEM file password" )
+#endif
+ ;
+
+ }
+
+
+#if defined(_WIN32)
+ void CmdLine::addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden ) {
+ windows.add_options()
+ ("install", "install mongodb service")
+ ("remove", "remove mongodb service")
+ ("reinstall", "reinstall mongodb service (equivilant of mongod --remove followed by mongod --install)")
+ ("serviceName", po::value<string>(), "windows service name")
+ ("serviceDisplayName", po::value<string>(), "windows service display name")
+ ("serviceDescription", po::value<string>(), "windows service description")
+ ("serviceUser", po::value<string>(), "user name service executes as")
+ ("servicePassword", po::value<string>(), "password used to authenticate serviceUser")
+ ;
+ hidden.add_options()("service", "start mongodb service");
+ }
+#endif
+
+ void CmdLine::parseConfigFile( istream &f, stringstream &ss ) {
+ string s;
+ char line[MAX_LINE_LENGTH];
+
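+ // the expected format is simple "key = value" lines, e.g. (hypothetical file):
+ // port = 27018
+ // logpath = /var/log/mongod.log
+ // quiet = true
+ // lines of the form "option = false" are rejected below because boost's parser
+ // treats the mere presence of a switch option as turning it on.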
+ while ( f ) {
+ f.getline(line, MAX_LINE_LENGTH);
+ s = line;
+ // erase-remove idiom: std::remove alone only shuffles characters forward and
+ // does not shrink the string, so its result must be passed to erase()
+ s.erase( std::remove(s.begin(), s.end(), ' '), s.end() );
+ s.erase( std::remove(s.begin(), s.end(), '\t'), s.end() );
+ boost::to_upper(s);
+
+ if ( s.find( "FASTSYNC" ) != string::npos )
+ cout << "warning \"fastsync\" should not be put in your configuration file" << endl;
+
+ if ( s.c_str()[0] == '#' ) {
+ // skipping commented line
+ } else if ( s.find( "=FALSE" ) == string::npos ) {
+ ss << line << endl;
+ } else {
+ cout << "warning: remove or comment out this line by starting it with \'#\', skipping now : " << line << endl;
+ }
+ }
+ return;
+ }
+
+#ifndef _WIN32
+ // support for exit value propagation with fork
+ void launchSignal( int sig ) {
+ if ( sig == SIGUSR2 ) {
+ pid_t cur = getpid();
+
+ if ( cur == cmdLine.parentProc || cur == cmdLine.leaderProc ) {
+ // signal indicates successful start allowing us to exit
+ _exit(0);
+ }
+ }
+ }
+
+ void setupLaunchSignals() {
+ assert( signal(SIGUSR2 , launchSignal ) != SIG_ERR );
+ }
+
+
+ void CmdLine::launchOk() {
+ if ( cmdLine.doFork ) {
+ // killing leader will propagate to parent
+ assert( kill( cmdLine.leaderProc, SIGUSR2 ) == 0 );
+ }
+ }
+#endif
+
+ bool CmdLine::store( int argc , char ** argv ,
+ boost::program_options::options_description& visible,
+ boost::program_options::options_description& hidden,
+ boost::program_options::positional_options_description& positional,
+ boost::program_options::variables_map &params ) {
+
+
+ {
+ // setup binary name
+ cmdLine.binaryName = argv[0];
+ size_t i = cmdLine.binaryName.rfind( '/' );
+ if ( i != string::npos )
+ cmdLine.binaryName = cmdLine.binaryName.substr( i + 1 );
+
+ // setup cwd
+ char buffer[1024];
+#ifdef _WIN32
+ assert( _getcwd( buffer , 1000 ) );
+#else
+ assert( getcwd( buffer , 1000 ) );
+#endif
+ cmdLine.cwd = buffer;
+ }
+
+
+ /* don't allow guessing - creates ambiguities when some options are
+ * prefixes of others. allow long disguises and don't allow guessing
+ * to get away with our vvvvvvv trick. */
+ int style = (((po::command_line_style::unix_style ^
+ po::command_line_style::allow_guessing) |
+ po::command_line_style::allow_long_disguise) ^
+ po::command_line_style::allow_sticky);
+
+
+ try {
+
+ po::options_description all;
+ all.add( visible );
+ all.add( hidden );
+
+ po::store( po::command_line_parser(argc, argv)
+ .options( all )
+ .positional( positional )
+ .style( style )
+ .run(),
+ params );
+
+ if ( params.count("config") ) {
+ ifstream f( params["config"].as<string>().c_str() );
+ if ( ! f.is_open() ) {
+ cout << "ERROR: could not read from config file" << endl << endl;
+ cout << visible << endl;
+ return false;
+ }
+
+ stringstream ss;
+ CmdLine::parseConfigFile( f, ss );
+ po::store( po::parse_config_file( ss , all ) , params );
+ f.close();
+ }
+
+ po::notify(params);
+ }
+ catch (po::error &e) {
+ cout << "error command line: " << e.what() << endl;
+ cout << "use --help for help" << endl;
+ //cout << visible << endl;
+ return false;
+ }
+
+ if (params.count("verbose")) {
+ logLevel = 1;
+ }
+
+ for (string s = "vv"; s.length() <= 12; s.append("v")) {
+ if (params.count(s)) {
+ logLevel = s.length();
+ }
+ }
+
+ if (params.count("quiet")) {
+ cmdLine.quiet = true;
+ }
+
+ if ( params.count( "maxConns" ) ) {
+ int newSize = params["maxConns"].as<int>();
+ if ( newSize < 5 ) {
+ out() << "maxConns has to be at least 5" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if ( newSize >= 10000000 ) {
+ out() << "maxConns can't be greater than 10000000" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ connTicketHolder.resize( newSize );
+ }
+
+ if (params.count("objcheck")) {
+ cmdLine.objcheck = true;
+ }
+
+ string logpath;
+
+#ifndef _WIN32
+ if (params.count("unixSocketPrefix")) {
+ cmdLine.socket = params["unixSocketPrefix"].as<string>();
+ if (!fs::is_directory(cmdLine.socket)) {
+ cout << cmdLine.socket << " must be a directory" << endl;
+ ::exit(-1);
+ }
+ }
+
+ if (params.count("nounixsocket")) {
+ cmdLine.noUnixSocket = true;
+ }
+
+ if (params.count("fork")) {
+ cmdLine.doFork = true;
+ if ( ! params.count( "logpath" ) && ! params.count( "syslog" ) ) {
+ cout << "--fork has to be used with --logpath or --syslog" << endl;
+ ::exit(-1);
+ }
+
+ if ( params.count( "logpath" ) ) {
+ // test logpath
+ logpath = params["logpath"].as<string>();
+ assert( logpath.size() );
+ if ( logpath[0] != '/' ) {
+ logpath = cmdLine.cwd + "/" + logpath;
+ }
+ FILE * test = fopen( logpath.c_str() , "a" );
+ if ( ! test ) {
+ cout << "can't open [" << logpath << "] for log file: " << errnoWithDescription() << endl;
+ ::exit(-1);
+ }
+ fclose( test );
+ }
+
+ cout.flush();
+ cerr.flush();
+
+ cmdLine.parentProc = getpid();
+
+ // facilitate clean exit when child starts successfully
+ setupLaunchSignals();
+
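+ // daemonization sketch: fork twice (parent -> leader -> worker), with both
+ // ancestors blocking in waitpid. when startup succeeds the worker calls
+ // launchOk(), which SIGUSR2s the leader; launchSignal() _exit(0)s the leader,
+ // the waiting parent then exits 0 as well, and the invoking shell regains
+ // control only once the server is actually up.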
+ pid_t c = fork();
+ if ( c ) {
+ int pstat;
+ waitpid(c, &pstat, 0);
+
+ if ( WIFEXITED(pstat) ) {
+ if ( ! WEXITSTATUS(pstat) ) {
+ cout << "child process started successfully, parent exiting" << endl;
+ }
+
+ _exit( WEXITSTATUS(pstat) );
+ }
+
+ _exit(50);
+ }
+
+ if ( chdir("/") < 0 ) {
+ cout << "Cant chdir() while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+ setsid();
+
+ cmdLine.leaderProc = getpid();
+
+ pid_t c2 = fork();
+ if ( c2 ) {
+ int pstat;
+ cout << "forked process: " << c2 << endl;
+ waitpid(c2, &pstat, 0);
+
+ if ( WIFEXITED(pstat) ) {
+ _exit( WEXITSTATUS(pstat) );
+ }
+
+ _exit(51);
+ }
+
+ // stdout handled in initLogging
+ //fclose(stdout);
+ //freopen("/dev/null", "w", stdout);
+
+ fclose(stderr);
+ fclose(stdin);
+
+ FILE* f = freopen("/dev/null", "w", stderr);
+ if ( f == NULL ) {
+ cout << "Cant reassign stderr while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+
+ f = freopen("/dev/null", "r", stdin);
+ if ( f == NULL ) {
+ cout << "Cant reassign stdin while forking server process: " << strerror(errno) << endl;
+ ::exit(-1);
+ }
+
+ setupCoreSignals();
+ setupSignals( true );
+ }
+
+ if (params.count("syslog")) {
+ StringBuilder sb(128);
+ sb << cmdLine.binaryName << "." << cmdLine.port;
+ Logstream::useSyslog( sb.str().c_str() );
+ }
+#endif
+ if (params.count("logpath")) {
+ if ( params.count("syslog") ) {
+ cout << "Cant use both a logpath and syslog " << endl;
+ ::exit(-1);
+ }
+
+ if ( logpath.size() == 0 )
+ logpath = params["logpath"].as<string>();
+ uassert( 10033 , "logpath has to be non-zero" , logpath.size() );
+ initLogging( logpath , params.count( "logappend" ) );
+ }
+
+ if ( params.count("pidfilepath")) {
+ writePidFile( params["pidfilepath"].as<string>() );
+ }
+
+ if (params.count("keyFile")) {
+ const string f = params["keyFile"].as<string>();
+
+ if (!setUpSecurityKey(f)) {
+ // error message printed in setUpSecurityKey
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ cmdLine.keyFile = true;
+ noauth = false;
+ }
+ else {
+ cmdLine.keyFile = false;
+ }
+
+#ifdef MONGO_SSL
+ if (params.count("sslOnNormalPorts") ) {
+ cmdLine.sslOnNormalPorts = true;
+
+ if ( cmdLine.sslPEMKeyPassword.size() == 0 ) {
+ log() << "need sslPEMKeyPassword" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ if ( cmdLine.sslPEMKeyFile.size() == 0 ) {
+ log() << "need sslPEMKeyFile" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+
+ cmdLine.sslServerManager = new SSLManager( false );
+ cmdLine.sslServerManager->setupPEM( cmdLine.sslPEMKeyFile , cmdLine.sslPEMKeyPassword );
+ }
+
+ // PEM key settings are meaningless unless ssl is actually enabled
+ else if ( cmdLine.sslPEMKeyFile.size() || cmdLine.sslPEMKeyPassword.size() ) {
+ log() << "need to enable sslOnNormalPorts" << endl;
+ dbexit(EXIT_BADOPTIONS);
+ }
+#endif
+
+ {
+ BSONObjBuilder b;
+ for (po::variables_map::const_iterator it(params.begin()), end(params.end()); it != end; it++){
+ if (!it->second.defaulted()){
+ const string& key = it->first;
+ const po::variable_value& value = it->second;
+ const type_info& type = value.value().type();
+
+ if (type == typeid(string)){
+ if (value.as<string>().empty())
+ b.appendBool(key, true); // boost po uses empty string for flags like --quiet
+ else
+ b.append(key, value.as<string>());
+ }
+ else if (type == typeid(int))
+ b.append(key, value.as<int>());
+ else if (type == typeid(double))
+ b.append(key, value.as<double>());
+ else if (type == typeid(bool))
+ b.appendBool(key, value.as<bool>());
+ else if (type == typeid(long))
+ b.appendNumber(key, (long long)value.as<long>());
+ else if (type == typeid(unsigned))
+ b.appendNumber(key, (long long)value.as<unsigned>());
+ else if (type == typeid(unsigned long long))
+ b.appendNumber(key, (long long)value.as<unsigned long long>());
+ else if (type == typeid(vector<string>))
+ b.append(key, value.as<vector<string> >());
+ else
+ b.append(key, "UNKNOWN TYPE: " + demangleName(type));
+ }
+ }
+ parsedOpts = b.obj();
+ }
+
+ {
+ BSONArrayBuilder b;
+ for (int i=0; i < argc; i++)
+ b << argv[i];
+ argvArray = b.arr();
+ }
+
+ return true;
+ }
+
+ void printCommandLineOpts() {
+ log() << "options: " << parsedOpts << endl;
+ }
+
+ void ignoreSignal( int sig ) {}
+
+ void setupCoreSignals() {
+#if !defined(_WIN32)
+ assert( signal(SIGUSR1 , rotateLogs ) != SIG_ERR );
+ assert( signal(SIGHUP , ignoreSignal ) != SIG_ERR );
+#endif
+ }
+
+ class CmdGetCmdLineOpts : Command {
+ public:
+ CmdGetCmdLineOpts(): Command("getCmdLineOpts") {}
+ void help(stringstream& h) const { h << "get argv"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool slaveOk() const { return true; }
+
+ virtual bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ result.append("argv", argvArray);
+ result.append("parsed", parsedOpts);
+ return true;
+ }
+
+ } cmdGetCmdLineOpts;
+
+ string prettyHostName() {
+ StringBuilder s(128);
+ s << getHostNameCached();
+ if( cmdLine.port != CmdLine::DefaultDBPort )
+ s << ':' << mongo::cmdLine.port;
+ return s.str();
+ }
+
+ casi< map<string,ParameterValidator*> * > pv_all (NULL);
+
+ ParameterValidator::ParameterValidator( const string& name ) : _name( name ) {
+ if ( ! pv_all)
+ pv_all.ref() = new map<string,ParameterValidator*>();
+ (*pv_all.ref())[_name] = this;
+ }
+
+ ParameterValidator * ParameterValidator::get( const string& name ) {
+ map<string,ParameterValidator*>::const_iterator i = pv_all.get()->find( name );
+ if ( i == pv_all.get()->end() )
+ return NULL;
+ return i->second;
+ }
+
+}
diff --git a/src/mongo/db/cmdline.h b/src/mongo/db/cmdline.h
new file mode 100644
index 00000000000..5fe6ceb1005
--- /dev/null
+++ b/src/mongo/db/cmdline.h
@@ -0,0 +1,203 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+#ifdef MONGO_SSL
+ class SSLManager;
+#endif
+
+ /* command line options
+ */
+ /* concurrency: OK/READ */
+ struct CmdLine {
+
+ CmdLine();
+
+ string binaryName; // mongod or mongos
+ string cwd; // cwd of when process started
+
+ // this is suboptimal as someone could rename a binary. todo...
+ bool isMongos() const { return binaryName == "mongos"; }
+
+ int port; // --port
+ enum {
+ DefaultDBPort = 27017,
+ ConfigServerPort = 27019,
+ ShardServerPort = 27018
+ };
+ bool isDefaultPort() const { return port == DefaultDBPort; }
+
+ string bind_ip; // --bind_ip
+ bool rest; // --rest
+ bool jsonp; // --jsonp
+
+ string _replSet; // --replSet[/<seedlist>]
+ string ourSetName() const {
+ string setname;
+ size_t sl = _replSet.find('/');
+ if( sl == string::npos )
+ return _replSet;
+ return _replSet.substr(0, sl);
+ }
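+ // e.g. --replSet "rs0/h1:27017,h2:27017" gives ourSetName() == "rs0"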
+ bool usingReplSets() const { return !_replSet.empty(); }
+
+ // for master/slave replication
+ string source; // --source
+ string only; // --only
+
+ bool quiet; // --quiet
+ bool noTableScan; // --notablescan no table scans allowed
+ bool prealloc; // --noprealloc no preallocation of data files
+ bool preallocj; // --nopreallocj no preallocation of journal files
+ bool smallfiles; // --smallfiles allocate smaller data files
+
+ bool configsvr; // --configsvr
+
+ bool quota; // --quota
+ int quotaFiles; // --quotaFiles
+ bool cpu; // --cpu show cpu time periodically
+
+ bool dur; // --dur durability (now --journal)
+ unsigned journalCommitInterval; // group/batch commit interval ms
+
+ /** --durOptions 7 dump journal and terminate without doing anything further
+ --durOptions 4 recover and terminate without listening
+ */
+ enum { // bits to be ORed
+ DurDumpJournal = 1, // dump diagnostics on the journal during recovery
+ DurScanOnly = 2, // don't do any real work, just scan and dump if dump specified
+ DurRecoverOnly = 4, // terminate after recovery step
+ DurParanoid = 8, // paranoid mode enables extra checks
+ DurAlwaysCommit = 16, // do a group commit every time the writelock is released
+ DurAlwaysRemap = 32, // remap the private view after every group commit (may lag to the next write lock acquisition, but will do all files then)
+ DurNoCheckSpace = 64 // don't check that there is enough room for journal files before startup (for diskfull tests)
+ };
+ int durOptions; // --durOptions <n> for debugging
+
+ bool objcheck; // --objcheck
+
+ long long oplogSize; // --oplogSize
+ int defaultProfile; // --profile
+ int slowMS; // --time in ms that is "slow"
+
+ int pretouch; // --pretouch for replication application (experimental)
+ bool moveParanoia; // for move chunk paranoia
+ double syncdelay; // seconds between fsyncs
+
+ bool noUnixSocket; // --nounixsocket
+ bool doFork; // --fork
+ string socket; // UNIX domain socket directory
+
+ bool keyFile;
+
+#ifndef _WIN32
+ pid_t parentProc; // --fork pid of initial process
+ pid_t leaderProc; // --fork pid of leader process
+#endif
+
+#ifdef MONGO_SSL
+ bool sslOnNormalPorts; // --sslOnNormalPorts
+ string sslPEMKeyFile; // --sslPEMKeyFile
+ string sslPEMKeyPassword; // --sslPEMKeyPassword
+
+ SSLManager* sslServerManager; // currently leaks on close
+#endif
+
+ static void launchOk();
+
+ static void addGlobalOptions( boost::program_options::options_description& general ,
+ boost::program_options::options_description& hidden );
+
+ static void addWindowsOptions( boost::program_options::options_description& windows ,
+ boost::program_options::options_description& hidden );
+
+
+ static void parseConfigFile( istream &f, stringstream &ss);
+ /**
+ * @return true if should run program, false if should exit
+ */
+ static bool store( int argc , char ** argv ,
+ boost::program_options::options_description& visible,
+ boost::program_options::options_description& hidden,
+ boost::program_options::positional_options_description& positional,
+ boost::program_options::variables_map &output );
+
+ time_t started;
+ };
+
+ // todo move to cmdline.cpp?
+ inline CmdLine::CmdLine() :
+ port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), preallocj(true), smallfiles(sizeof(int*) == 4),
+ configsvr(false),
+ quota(false), quotaFiles(8), cpu(false), durOptions(0), objcheck(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ),
+ syncdelay(60), noUnixSocket(false), doFork(0), socket("/tmp")
+ {
+ started = time(0);
+
+ journalCommitInterval = 0; // 0 means use default
+ dur = false;
+#if defined(_DURABLEDEFAULTON)
+ dur = true;
+#endif
+ if( sizeof(void*) == 8 )
+ dur = true;
+#if defined(_DURABLEDEFAULTOFF)
+ dur = false;
+#endif
+
+#ifdef MONGO_SSL
+ sslOnNormalPorts = false;
+ sslServerManager = 0;
+#endif
+ }
+
+ extern CmdLine cmdLine;
+
+ void setupLaunchSignals();
+ void setupCoreSignals();
+
+ string prettyHostName();
+
+ void printCommandLineOpts();
+
+ /**
+ * used for the setParameter command.
+ * lets you write validation code that lives with the code using the parameter,
+ * rather than centralizing it all in the command implementation.
+ * also lets you have mongos- or mongod-specific validation
+ * without pulling in all sorts of dependencies.
+ */
+ class ParameterValidator {
+ public:
+ ParameterValidator( const string& name );
+ virtual ~ParameterValidator() {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) const = 0;
+
+ static ParameterValidator * get( const string& name );
+
+ private:
+ const string _name;
+ };
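+ /* minimal usage sketch (hypothetical validator, not part of this change):
+
+ class LogLevelValidator : public ParameterValidator {
+ public:
+ LogLevelValidator() : ParameterValidator( "logLevel" ) {}
+ virtual bool isValid( BSONElement e , string& errmsg ) const {
+ if ( e.isNumber() && e.numberInt() >= 0 && e.numberInt() <= 5 )
+ return true;
+ errmsg = "logLevel must be an integer 0-5";
+ return false;
+ }
+ } logLevelValidatorSingleton;
+ */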
+
+}
+
diff --git a/src/mongo/db/collection.h b/src/mongo/db/collection.h
new file mode 100644
index 00000000000..998b2f0beac
--- /dev/null
+++ b/src/mongo/db/collection.h
@@ -0,0 +1,15 @@
+// @file collection.h
+
+#pragma once
+
+#include "namespace.h"
+
+namespace mongo {
+
+ class Collection {
+ public:
+ NamespaceDetails * const d;
+ NamespaceDetailsTransient * const nsd;
+ };
+
+}
diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp
new file mode 100755
index 00000000000..cbe9ffc6861
--- /dev/null
+++ b/src/mongo/db/commands.cpp
@@ -0,0 +1,209 @@
+/* commands.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "jsobj.h"
+#include "commands.h"
+#include "client.h"
+#include "replutil.h"
+
+namespace mongo {
+
+ map<string,Command*> * Command::_commandsByBestName;
+ map<string,Command*> * Command::_webCommands;
+ map<string,Command*> * Command::_commands;
+
+ string Command::parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const {
+ string s = cmdObj.firstElement().valuestr();
+ NamespaceString nss(s);
+ // these are for security, do not remove:
+ verify(15966, dbname == nss.db || dbname == "admin" );
+ verify(15962, !nss.db.empty() );
+ return s;
+ }
+
+ /*virtual*/ string Command::parseNs(const string& dbname, const BSONObj& cmdObj) const {
+ string coll = cmdObj.firstElement().valuestr();
+#if defined(CLC)
+ DEV if( mongoutils::str::startsWith(coll, dbname+'.') ) {
+ log() << "DEBUG parseNs Command's collection name looks like it includes the db name\n"
+ << dbname << '\n'
+ << coll << '\n'
+ << cmdObj.toString() << endl;
+ dassert(false);
+ }
+#endif
+ return dbname + '.' + coll;
+ }
+
+ void Command::htmlHelp(stringstream& ss) const {
+ string helpStr;
+ {
+ stringstream h;
+ help(h);
+ helpStr = h.str();
+ }
+ ss << "\n<tr><td>";
+ bool web = _webCommands->count(name) != 0;
+ if( web ) ss << "<a href=\"/" << name << "?text=1\">";
+ ss << name;
+ if( web ) ss << "</a>";
+ ss << "</td>\n";
+ ss << "<td>";
+ int l = locktype();
+ //if( l == NONE ) ss << "N ";
+ if( l == READ ) ss << "R ";
+ else if( l == WRITE ) ss << "W ";
+ if( slaveOk() )
+ ss << "S ";
+ if( adminOnly() )
+ ss << "A";
+ ss << "</td>";
+ ss << "<td>";
+ if( helpStr != "no help defined" ) {
+ const char *p = helpStr.c_str();
+ while( *p ) {
+ if( *p == '<' ) {
+ ss << "&lt;";
+ p++; continue;
+ }
+ else if( *p == '{' )
+ ss << "<code>";
+ else if( *p == '}' ) {
+ ss << "}</code>";
+ p++;
+ continue;
+ }
+ if( strncmp(p, "http:", 5) == 0 ) {
+ ss << "<a href=\"";
+ const char *q = p;
+ while( *q && *q != ' ' && *q != '\n' )
+ ss << *q++;
+ ss << "\">";
+ q = p;
+ if( startsWith(q, "http://www.mongodb.org/display/") )
+ q += 31;
+ while( *q && *q != ' ' && *q != '\n' ) {
+ ss << (*q == '+' ? ' ' : *q);
+ q++;
+ if( *q == '#' )
+ while( *q && *q != ' ' && *q != '\n' ) q++;
+ }
+ ss << "</a>";
+ p = q;
+ continue;
+ }
+ if( *p == '\n' ) ss << "<br>";
+ else ss << *p;
+ p++;
+ }
+ }
+ ss << "</td>";
+ ss << "</tr>\n";
+ }
+
+ Command::Command(const char *_name, bool web, const char *oldName) : name(_name) {
+ // register ourself.
+ if ( _commands == 0 )
+ _commands = new map<string,Command*>;
+ if( _commandsByBestName == 0 )
+ _commandsByBestName = new map<string,Command*>;
+ Command*& c = (*_commands)[name];
+ if ( c )
+ log() << "warning: 2 commands with name: " << _name << endl;
+ c = this;
+ (*_commandsByBestName)[name] = this;
+
+ if( web ) {
+ if( _webCommands == 0 )
+ _webCommands = new map<string,Command*>;
+ (*_webCommands)[name] = this;
+ }
+
+ if( oldName )
+ (*_commands)[oldName] = this;
+ }
+
+ void Command::help( stringstream& help ) const {
+ help << "no help defined";
+ }
+
+ Command* Command::findCommand( const string& name ) {
+ map<string,Command*>::iterator i = _commands->find( name );
+ if ( i == _commands->end() )
+ return 0;
+ return i->second;
+ }
+
+
+ Command::LockType Command::locktype( const string& name ) {
+ Command * c = findCommand( name );
+ if ( ! c )
+ return WRITE;
+ return c->locktype();
+ }
+
+ void Command::logIfSlow( const Timer& timer, const string& msg ) {
+ int ms = timer.millis();
+ if ( ms > cmdLine.slowMS ) {
+ out() << msg << " took " << ms << " ms." << endl;
+ }
+ }
+
+}
+
+#include "../client/connpool.h"
+
+namespace mongo {
+
+ extern DBConnectionPool pool;
+
+ class PoolFlushCmd : public Command {
+ public:
+ PoolFlushCmd() : Command( "connPoolSync" , false , "connpoolsync" ) {}
+ virtual void help( stringstream &help ) const { help<<"internal"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.flush();
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolFlushCmd;
+
+ class PoolStats : public Command {
+ public:
+ PoolStats() : Command( "connPoolStats" ) {}
+ virtual void help( stringstream &help ) const { help<<"stats about connection pool"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.appendInfo( result );
+ result.append( "numDBClientConnection" , DBClientConnection::getNumConnections() );
+ result.append( "numAScopedConnection" , AScopedConnection::getNumConnections() );
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolStatsCmd;
+
+} // namespace mongo
diff --git a/src/mongo/db/commands.h b/src/mongo/db/commands.h
new file mode 100644
index 00000000000..85cdd38d7a4
--- /dev/null
+++ b/src/mongo/db/commands.h
@@ -0,0 +1,164 @@
+// commands.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ class BSONObj;
+ class BSONObjBuilder;
+ class Client;
+ class Timer;
+
+ /** mongodb "commands" (sent via db.$cmd.findOne(...))
+ subclass to make a command. define a singleton object for it.
+ */
+ class Command {
+ protected:
+ string parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const;
+ public:
+ // only makes sense for commands where the 1st param is the collection.
+ virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const;
+
+ enum LockType { READ = -1 , NONE = 0 , WRITE = 1 };
+
+ const string name;
+
+ /* run the given command
+ implement this...
+
+ fromRepl - command is being invoked as part of replication syncing. In this situation you
+ normally do not want to log the command to the local oplog.
+
+ return value is true if succeeded. if false, set errmsg text.
+ */
+ virtual bool run(const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) = 0;
+
+ /*
+ note: logTheOp() MUST be false if READ
+ if NONE, can't use Client::Context setup
+ use with caution
+ */
+ virtual LockType locktype() const = 0;
+
+ /* Return true if only the admin ns has privileges to run this command. */
+ virtual bool adminOnly() const {
+ return false;
+ }
+
+ void htmlHelp(stringstream&) const;
+
+ /* Like adminOnly, but even stricter: we must either be authenticated for admin db,
+ or, if running without auth, on the local interface. Used for things which
+ are so major that remote invocation may not make sense (e.g., shutdownServer).
+
+ When localHostOnlyIfNoAuth() is true, adminOnly() must also be true.
+ */
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return false; }
+
+ /* Return true if slaves are allowed to execute the command
+ (i.e. when the command arrives directly from a client -- if fromRepl, it is always allowed).
+ */
+ virtual bool slaveOk() const = 0;
+
+ /* Return true if the client force a command to be run on a slave by
+ turning on the 'slaveOk' option in the command query.
+ */
+ virtual bool slaveOverrideOk() {
+ return false;
+ }
+
+ /* Override and return true to log the operation (logOp()) to the replication log.
+ (not done if fromRepl of course)
+
+ Note if run() returns false, we do NOT log.
+ */
+ virtual bool logTheOp() { return false; }
+
+ virtual void help( stringstream& help ) const;
+
+ /* Return true if authentication and security applies to the commands. Some commands
+ (e.g., getnonce, authenticate) can be done by anyone even unauthorized.
+ */
+ virtual bool requiresAuth() { return true; }
+
+ /* Return true if a replica set secondary should go into "recovering"
+ (unreadable) state while running this command.
+ */
+ virtual bool maintenanceMode() const { return false; }
+
+ /* Return true if command should be permitted when a replica set secondary is in "recovering"
+ (unreadable) state.
+ */
+ virtual bool maintenanceOk() const { return true; /* assumed true prior to commit */ }
+
+ /** @param webUI expose the command in the web ui as localhost:28017/<name>
+ @param oldName an optional old, deprecated name for the command
+ */
+ Command(const char *_name, bool webUI = false, const char *oldName = 0);
+
+ virtual ~Command() {}
+
+ protected:
+ BSONObj getQuery( const BSONObj& cmdObj ) {
+ if ( cmdObj["query"].type() == Object )
+ return cmdObj["query"].embeddedObject();
+ if ( cmdObj["q"].type() == Object )
+ return cmdObj["q"].embeddedObject();
+ return BSONObj();
+ }
+
+ static void logIfSlow( const Timer& cmdTimer, const string& msg);
+
+ static map<string,Command*> * _commands;
+ static map<string,Command*> * _commandsByBestName;
+ static map<string,Command*> * _webCommands;
+
+ public:
+ static const map<string,Command*>* commandsByBestName() { return _commandsByBestName; }
+ static const map<string,Command*>* webCommands() { return _webCommands; }
+ /** @return if command was found and executed */
+ static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions = 0);
+ static LockType locktype( const string& name );
+ static Command * findCommand( const string& name );
+ };
+
+ class CmdShutdown : public Command {
+ public:
+ virtual bool requiresAuth() { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return true; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const;
+ CmdShutdown() : Command("shutdown") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ private:
+ bool shutdownHelper();
+ };
+
+ bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions);
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/aggregate.js b/src/mongo/db/commands/aggregate.js
new file mode 100755
index 00000000000..7741e3121ff
--- /dev/null
+++ b/src/mongo/db/commands/aggregate.js
@@ -0,0 +1,184 @@
+/* sample aggregate command queries */
+
+// make sure we're using the right db; this is the same as "use mydb;" in shell
+db = db.getSisterDB("mydb");
+
+// just passing through fields
+var p1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ tags : 1,
+ pageViews : 1
+ }}
+]});
+
+// unwinding an array
+var p2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }}
+]});
+
+// pulling values out of subdocuments
+var p3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ otherfoo : "other.foo",
+ otherbar : "other.bar"
+ }}
+]});
+
+// projection includes a computed value
+var p4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ daveWroteIt : { $eq:["$author", "dave"] }
+ }}
+]});
+
+// projection includes a virtual (fabricated) document
+var p5 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $project : {
+ author : 1,
+ subDocument : { foo : "pageViews", bar : "tag" }
+ }}
+]});
+
+// multi-step aggregate
+// nested expressions in computed fields
+var p6 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $project : {
+ author : 1,
+ tag : 1,
+ pageViews : 1,
+ daveWroteIt : { $eq:["$author", "dave"] },
+ weLikeIt : { $or:[ { $eq:["$author", "dave"] },
+ { $eq:["$tag", "good"] } ] }
+ }}
+]});
+
+// slightly more complex computed expression; $ifnull
+var p7 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ theSum : { $add:["$pageViews",
+ { $ifnull:["$other.foo",
+ "$other.bar"] } ] }
+ }}
+]});
+
+// dotted path inclusion; _id exclusion
+var p8 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ _id : 0,
+ author : 1,
+ tag : { $unwind : "tags" },
+ "comments.author" : 1
+ }}
+]});
+
+
+// simple matching
+var m1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $match : { author : "dave" } }
+]});
+
+// combining matching with a projection
+var m2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ title : 1,
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" },
+ comments : 1
+ }},
+ { $match : { tag : "nasty" } }
+]});
+
+
+// group by tag
+var g1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" }
+ }}
+]});
+
+// $max, and averaging in a final projection
+var g2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ mostViewsByTag : { $max : "$pageViews" },
+ }},
+ { $project : {
+ _id: false,
+ tag : "_id.tag",
+ mostViewsByTag : 1,
+ docsByTag : 1,
+ viewsByTag : 1,
+ avgByTag : { $divide:["$viewsByTag", "$docsByTag"] }
+ }}
+]});
+
+// $push as an accumulator; can pivot data
+var g3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $group : {
+ _id : { tag : 1 },
+ authors : { $push : "$author" }
+ }}
+]});
+
+// $avg accumulator, averaging within the $group itself (no final projection needed)
+var g4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ avgByTag : { $avg : "$pageViews" },
+ }}
+]});
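+
+// each var above holds the raw command reply; on success the pipeline's documents
+// are expected in the reply's "result" array, e.g. (sketch):
+//     p1.result.forEach(printjson);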
diff --git a/src/mongo/db/commands/cloud.cpp b/src/mongo/db/commands/cloud.cpp
new file mode 100644
index 00000000000..8f9d9d2e4b5
--- /dev/null
+++ b/src/mongo/db/commands/cloud.cpp
@@ -0,0 +1,90 @@
+#include "../commands.h"
+#include <map>
+#include "../../util/concurrency/value.h"
+#include "../../util/mongoutils/str.h"
+#include "../../util/net/hostandport.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ mapsf<string,string> dynHostNames;
+ extern DiagStr _hostNameCached;
+
+ string dynHostMyName() {
+ if( !str::startsWith(_hostNameCached, '#') )
+ return "";
+ return _hostNameCached;
+ }
+
+ void dynHostResolve(string& name, int& port) {
+ assert( !name.empty() );
+ assert( !str::contains(name, ':') );
+ assert( str::startsWith(name, '#') );
+ string s = dynHostNames.get(name);
+ if( s.empty() ) {
+ name.clear();
+ return;
+ }
+ assert( !str::startsWith(s, '#') );
+ HostAndPort hp(s);
+ if( hp.hasPort() ) {
+ port = hp.port();
+ log() << "info: dynhost in:" << name << " out:" << hp.toString() << endl;
+ }
+ name = hp.host();
+ }
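+
+    // usage sketch (hypothetical values): with dynHostNames holding
+    //     { "#a" : "10.0.0.1:27017" }
+    // a call such as
+    //     string h = "#a"; int p = 0; dynHostResolve(h, p);
+    // leaves h == "10.0.0.1" and p == 27017; an unknown name clears h instead.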
+
+ /**
+ { cloud:1, nodes: {
+ name : <ip>, ...
+ },
+ me : <mylogicalname>
+ }
+ */
+ class CmdCloud : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() { return false; }
+ virtual bool adminOnly() const { return true; } // very important
+ virtual bool localHostOnlyIfNoAuth(const BSONObj&) { return true; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "internal\n";
+ help << "{cloud:1,nodes:...,me:<my_logical_name>}";
+ }
+ CmdCloud() : Command("cloud") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ assert(!fromRepl);
+ BSONObj nodes = cmdObj["nodes"].Obj();
+ map<string,string> ipmap;
+ for( BSONObj::iterator i(nodes); i.more(); ) {
+ BSONElement e = i.next();
+ assert( *e.fieldName() == '#' );
+ ipmap[e.fieldName()] = e.String();
+ }
+
+ string me = cmdObj["me"].String();
+ assert( !me.empty() && me[0] == '#' );
+
+ log(/*1*/) << "CmdCloud" << endl;
+
+ if( me != _hostNameCached.get() ) {
+ log() << "CmdCloud new 'me' value:" << me << endl;
+ _hostNameCached = me;
+ }
+
+ dynHostNames.swap(ipmap);
+ return true;
+ }
+ } cmdCloud;
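+
+    // example command document accepted by CmdCloud::run() (hypothetical addresses):
+    //     { cloud : 1 , nodes : { "#a" : "10.0.0.1:27017" , "#b" : "10.0.0.2" } , me : "#a" }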
+
+ BSONObj fromjson(const string &str);
+
+ void cloudCmdLineParamIs(string cmd) {
+ string errmsg;
+ BSONObjBuilder res;
+ BSONObj o = fromjson(cmd);
+ cmdCloud.run("", o, 0, errmsg, res, false);
+ }
+}
diff --git a/src/mongo/db/commands/distinct.cpp b/src/mongo/db/commands/distinct.cpp
new file mode 100644
index 00000000000..1926e6abddb
--- /dev/null
+++ b/src/mongo/db/commands/distinct.cpp
@@ -0,0 +1,157 @@
+// distinct.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+#include "../clientcursor.h"
+#include "../../util/timer.h"
+
+namespace mongo {
+
+ class DistinctCommand : public Command {
+ public:
+ DistinctCommand() : Command("distinct") {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }";
+ }
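+
+        // example shell invocation, matching the help string above (sketch):
+        //     db.runCommand( { distinct : "coll" , key : "a.b" , query : {} } )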
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ string key = cmdObj["key"].valuestrsafe();
+ BSONObj keyPattern = BSON( key << 1 );
+
+ BSONObj query = getQuery( cmdObj );
+
+ int bufSize = BSONObjMaxUserSize - 4096;
+ BufBuilder bb( bufSize );
+ char * start = bb.buf();
+
+ BSONArrayBuilder arr( bb );
+ BSONElementSet values;
+
+ long long nscanned = 0; // locations looked at
+ long long nscannedObjects = 0; // full objects looked at
+ long long n = 0; // matches
+ MatchDetails md;
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+
+ if ( ! d ) {
+ result.appendArray( "values" , BSONObj() );
+ result.append( "stats" , BSON( "n" << 0 << "nscanned" << 0 << "nscannedObjects" << 0 ) );
+ return true;
+ }
+
+ shared_ptr<Cursor> cursor;
+ if ( ! query.isEmpty() ) {
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
+ }
+ else {
+
+            // query is empty, so let's see if we can find an index
+ // with the key so we don't have to hit the raw data
+ NamespaceDetails::IndexIterator ii = d->ii();
+ while ( ii.more() ) {
+ IndexDetails& idx = ii.next();
+
+ if ( d->isMultikey( ii.pos() - 1 ) )
+ continue;
+
+ if ( idx.inKeyPattern( key ) ) {
+ cursor = bestGuessCursor( ns.c_str() , BSONObj() , idx.keyPattern() );
+ if( cursor.get() ) break;
+ }
+
+ }
+
+ if ( ! cursor.get() )
+ cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() );
+
+ }
+
+
+ assert( cursor );
+ string cursorName = cursor->toString();
+
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns));
+
+ while ( cursor->ok() ) {
+ nscanned++;
+ bool loadedObject = false;
+
+ if ( cursor->currentMatches( &md ) && !cursor->getsetdup( cursor->currLoc() ) ) {
+ n++;
+
+ BSONObj holder;
+ BSONElementSet temp;
+ loadedObject = ! cc->getFieldsDotted( key , temp, holder );
+
+ for ( BSONElementSet::iterator i=temp.begin(); i!=temp.end(); ++i ) {
+ BSONElement e = *i;
+ if ( values.count( e ) )
+ continue;
+
+ int now = bb.len();
+
+ uassert(10044, "distinct too big, 16mb cap", ( now + e.size() + 1024 ) < bufSize );
+
+ arr.append( e );
+ BSONElement x( start + now );
+
+ values.insert( x );
+ }
+ }
+
+ if ( loadedObject || md._loadedObject )
+ nscannedObjects++;
+
+ cursor->advance();
+
+ if (!cc->yieldSometimes( ClientCursor::MaybeCovered )) {
+ cc.release();
+ break;
+ }
+
+ RARELY killCurrentOp.checkForInterrupt();
+ }
+
+ assert( start == bb.buf() );
+
+ result.appendArray( "values" , arr.done() );
+
+ {
+ BSONObjBuilder b;
+ b.appendNumber( "n" , n );
+ b.appendNumber( "nscanned" , nscanned );
+ b.appendNumber( "nscannedObjects" , nscannedObjects );
+ b.appendNumber( "timems" , t.millis() );
+ b.append( "cursor" , cursorName );
+ result.append( "stats" , b.obj() );
+ }
+
+ return true;
+ }
+
+ } distinctCmd;
+
+}
diff --git a/src/mongo/db/commands/document_source_cursor.cpp b/src/mongo/db/commands/document_source_cursor.cpp
new file mode 100755
index 00000000000..49bb9f19d9e
--- /dev/null
+++ b/src/mongo/db/commands/document_source_cursor.cpp
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceCursor::~DocumentSourceCursor() {
+ }
+
+ bool DocumentSourceCursor::eof() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCursor::advance() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ findNext();
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCursor::getCurrent() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return pCurrent;
+ }
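+
+    /* typical consumption loop (sketch; assumes pSource came from create()):
+
+           while ( !pSource->eof() ) {
+               intrusive_ptr<Document> pDoc( pSource->getCurrent() );
+               // ... use pDoc ...
+               pSource->advance();
+           }
+    */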
+
+ void DocumentSourceCursor::findNext() {
+ /* standard cursor usage pattern */
+ while(pCursor->ok()) {
+ CoveredIndexMatcher *pCIM; // save intermediate result
+ if ((!(pCIM = pCursor->matcher()) ||
+ pCIM->matchesCurrent(pCursor.get())) &&
+ !pCursor->getsetdup(pCursor->currLoc())) {
+
+ /* grab the matching document */
+ BSONObj documentObj(pCursor->current());
+ pCurrent = Document::createFromBsonObj(&documentObj);
+ pCursor->advance();
+ return;
+ }
+
+ pCursor->advance();
+ }
+
+ /* if we got here, there aren't any more documents */
+ pCurrent.reset();
+ }
+
+ void DocumentSourceCursor::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCursor::sourceToBson(BSONObjBuilder *pBuilder) const {
+ /* this has no analog in the BSON world */
+ assert(false);
+ }
+
+ DocumentSourceCursor::DocumentSourceCursor(
+ const shared_ptr<Cursor> &pTheCursor):
+ pCursor(pTheCursor),
+ pCurrent() {
+ }
+
+ intrusive_ptr<DocumentSourceCursor> DocumentSourceCursor::create(
+ const shared_ptr<Cursor> &pCursor) {
+ assert(pCursor.get());
+ intrusive_ptr<DocumentSourceCursor> pSource(
+ new DocumentSourceCursor(pCursor));
+ return pSource;
+ }
+}
diff --git a/src/mongo/db/commands/find_and_modify.cpp b/src/mongo/db/commands/find_and_modify.cpp
new file mode 100644
index 00000000000..0cf766fcf87
--- /dev/null
+++ b/src/mongo/db/commands/find_and_modify.cpp
@@ -0,0 +1,153 @@
+// find_and_modify.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+    /* Find and modify an object, returning either the old (default) or the new value */
+ class CmdFindAndModify : public Command {
+ public:
+ virtual void help( stringstream &help ) const {
+ help <<
+ "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n"
+ "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n"
+ "Either update or remove is required, all other fields have default values.\n"
+ "Output is in the \"value\" field\n";
+ }
+
+ CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { }
+ virtual bool logTheOp() { return false; } // the modifications will be logged directly
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ static DBDirectClient db;
+
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+
+ BSONObj origQuery = cmdObj.getObjectField("query"); // defaults to {}
+ Query q (origQuery);
+ BSONElement sort = cmdObj["sort"];
+ if (!sort.eoo())
+ q.sort(sort.embeddedObjectUserCheck());
+
+ bool upsert = cmdObj["upsert"].trueValue();
+
+ BSONObj fieldsHolder (cmdObj.getObjectField("fields"));
+ const BSONObj* fields = (fieldsHolder.isEmpty() ? NULL : &fieldsHolder);
+
+ Projection projection;
+ if (fields) {
+ projection.init(fieldsHolder);
+ if (!projection.includeID())
+ fields = NULL; // do projection in post-processing
+ }
+
+ BSONObj out = db.findOne(ns, q, fields);
+ if (out.isEmpty()) {
+ if (!upsert) {
+ result.appendNull("value");
+ return true;
+ }
+
+ BSONElement update = cmdObj["update"];
+ uassert(13329, "upsert mode requires update field", !update.eoo());
+ uassert(13330, "upsert mode requires query field", !origQuery.isEmpty());
+ db.update(ns, origQuery, update.embeddedObjectUserCheck(), true);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue()) {
+ BSONElement _id = gle["upserted"];
+ if (_id.eoo())
+ _id = origQuery["_id"];
+
+ out = db.findOne(ns, QUERY("_id" << _id), fields);
+ }
+
+ }
+ else {
+
+ if (cmdObj["remove"].trueValue()) {
+ uassert(12515, "can't remove and update", cmdObj["update"].eoo());
+ db.remove(ns, QUERY("_id" << out["_id"]), 1);
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ }
+ else { // update
+
+ BSONElement queryId = origQuery["_id"];
+ if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) {
+ // need to include original query for $ positional operator
+
+ BSONObjBuilder b;
+ b.append(out["_id"]);
+ BSONObjIterator it(origQuery);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (strcmp(e.fieldName(), "_id"))
+ b.append(e);
+ }
+ q = Query(b.obj());
+ }
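+
+                    // e.g. (hypothetical) query {tags:"x"} with update {$set:{"tags.$":"y"}}:
+                    // the rebuilt query is { _id : <matched _id> , tags : "x" }, so the
+                    // positional operator still has something to bind to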
+
+ if (q.isComplex()) // update doesn't work with complex queries
+ q = Query(q.getFilter().getOwned());
+
+ BSONElement update = cmdObj["update"];
+ uassert(12516, "must specify remove or update", !update.eoo());
+ db.update(ns, q, update.embeddedObjectUserCheck());
+
+ BSONObj gle = db.getLastErrorDetailed();
+ result.append("lastErrorObject", gle);
+ if (gle["err"].type() == String) {
+ errmsg = gle["err"].String();
+ return false;
+ }
+
+ if (cmdObj["new"].trueValue())
+ out = db.findOne(ns, QUERY("_id" << out["_id"]), fields);
+ }
+ }
+
+ if (!fieldsHolder.isEmpty() && !fields){
+ // we need to run projection but haven't yet
+ out = projection.transform(out);
+ }
+
+ result.append("value", out);
+
+ return true;
+ }
+ } cmdFindAndModify;
+
+
+}
diff --git a/src/mongo/db/commands/group.cpp b/src/mongo/db/commands/group.cpp
new file mode 100644
index 00000000000..69fee587a47
--- /dev/null
+++ b/src/mongo/db/commands/group.cpp
@@ -0,0 +1,224 @@
+// group.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "../instance.h"
+#include "../queryoptimizer.h"
+#include "../../scripting/engine.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class GroupCommand : public Command {
+ public:
+ GroupCommand() : Command("group") {}
+ virtual LockType locktype() const { return READ; }
+ virtual bool slaveOk() const { return false; }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "http://www.mongodb.org/display/DOCS/Aggregation";
+ }
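+
+        // example shell invocation (sketch):
+        //     db.runCommand( { group : { ns : "coll" , key : { a : 1 } , cond : {} ,
+        //                               $reduce : function(obj,prev){ prev.n++; } ,
+        //                               initial : { n : 0 } } } )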
+
+ BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ) {
+ if ( func ) {
+ BSONObjBuilder b( obj.objsize() + 32 );
+ b.append( "0" , obj );
+ const BSONObj& key = b.obj();
+ int res = s->invoke( func , &key, 0 );
+ uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 );
+ int type = s->type("return");
+                uassert( 10042 , "return of $keyf has to be an object" , type == Object );
+ return s->getObject( "return" );
+ }
+ return obj.extractFields( keyPattern , true ).getOwned();
+ }
+
+ bool group( string realdbname , const string& ns , const BSONObj& query ,
+ BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope ,
+ BSONObj initial , string finalize ,
+ string& errmsg , BSONObjBuilder& result ) {
+
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( realdbname );
+ s->localConnect( realdbname.c_str() );
+
+ if ( reduceScope )
+ s->init( reduceScope );
+
+ s->setObject( "$initial" , initial , true );
+
+ s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ ScriptingFunction f = s->createFunction(
+ "function(){ "
+ " if ( $arr[n] == null ){ "
+ " next = {}; "
+ " Object.extend( next , $key ); "
+ " Object.extend( next , $initial , true ); "
+ " $arr[n] = next; "
+ " next = null; "
+ " } "
+ " $reduce( obj , $arr[n] ); "
+ "}" );
+
+ ScriptingFunction keyFunction = 0;
+ if ( keyFunctionCode.size() ) {
+ keyFunction = s->createFunction( keyFunctionCode.c_str() );
+ }
+
+
+ double keysize = keyPattern.objsize() * 3;
+ double keynum = 1;
+
+ map<BSONObj,int,BSONObjCmp> map;
+
+ shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query);
+ ClientCursor::CleanupPointer ccPointer;
+ ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
+
+ while ( cursor->ok() ) {
+
+ if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ if ( !cursor->currentMatches() || cursor->getsetdup( cursor->currLoc() ) ) {
+ cursor->advance();
+ continue;
+ }
+
+ if ( !ccPointer->yieldSometimes( ClientCursor::WillNeed ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ BSONObj obj = cursor->current();
+ cursor->advance();
+
+ BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() );
+ keysize += key.objsize();
+ keynum++;
+
+ int& n = map[key];
+ if ( n == 0 ) {
+ n = map.size();
+ s->setObject( "$key" , key , true );
+
+ uassert( 10043 , "group() can't handle more than 20000 unique keys" , n <= 20000 );
+ }
+
+ s->setObject( "obj" , obj , true );
+ s->setNumber( "n" , n - 1 );
+ if ( s->invoke( f , 0, 0 , 0 , true ) ) {
+ throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() );
+ }
+ }
+ ccPointer.reset();
+
+ if (!finalize.empty()) {
+ s->exec( "$finalize = " + finalize , "finalize define" , false , true , true , 100 );
+ ScriptingFunction g = s->createFunction(
+ "function(){ "
+ " for(var i=0; i < $arr.length; i++){ "
+ " var ret = $finalize($arr[i]); "
+ " if (ret !== undefined) "
+ " $arr[i] = ret; "
+ " } "
+ "}" );
+ s->invoke( g , 0, 0 , 0 , true );
+ }
+
+ result.appendArray( "retval" , s->getObject( "$arr" ) );
+ result.append( "count" , keynum - 1 );
+ result.append( "keys" , (int)(map.size()) );
+ s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 );
+ s->gc();
+
+ return true;
+ }
+
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+
+ if ( !globalScriptEngine ) {
+ errmsg = "server-side JavaScript execution is disabled";
+ return false;
+ }
+
+ /* db.$cmd.findOne( { group : <p> } ) */
+ const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck();
+
+ BSONObj q;
+ if ( p["cond"].type() == Object )
+ q = p["cond"].embeddedObject();
+ else if ( p["condition"].type() == Object )
+ q = p["condition"].embeddedObject();
+ else
+ q = getQuery( p );
+
+ if ( p["ns"].type() != String ) {
+ errmsg = "ns has to be set";
+ return false;
+ }
+
+ string ns = dbname + "." + p["ns"].String();
+
+ BSONObj key;
+ string keyf;
+ if ( p["key"].type() == Object ) {
+ key = p["key"].embeddedObjectUserCheck();
+ if ( ! p["$keyf"].eoo() ) {
+ errmsg = "can't have key and $keyf";
+ return false;
+ }
+ }
+ else if ( p["$keyf"].type() ) {
+ keyf = p["$keyf"]._asCode();
+ }
+ else {
+ // no key specified, will use entire object as key
+ }
+
+ BSONElement reduce = p["$reduce"];
+ if ( reduce.eoo() ) {
+ errmsg = "$reduce has to be set";
+ return false;
+ }
+
+ BSONElement initial = p["initial"];
+ if ( initial.type() != Object ) {
+ errmsg = "initial has to be an object";
+ return false;
+ }
+
+
+ string finalize;
+ if (p["finalize"].type())
+ finalize = p["finalize"]._asCode();
+
+ return group( dbname , ns , q ,
+ key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() ,
+ initial.embeddedObject() , finalize ,
+ errmsg , result );
+ }
+
+ } cmdGroup;
+
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/isself.cpp b/src/mongo/db/commands/isself.cpp
new file mode 100644
index 00000000000..ebf6d5bceec
--- /dev/null
+++ b/src/mongo/db/commands/isself.cpp
@@ -0,0 +1,246 @@
+// isself.cpp
+
+#include "pch.h"
+#include "../../util/net/listen.h"
+#include "../commands.h"
+#include "../../client/dbclient.h"
+#include "../security.h"
+
+#include <boost/algorithm/string.hpp>
+
+#ifndef _WIN32
+# ifndef __sunos__
+# include <ifaddrs.h>
+# endif
+# include <sys/resource.h>
+# include <sys/stat.h>
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netdb.h>
+#ifdef __openbsd__
+# include <sys/uio.h>
+#endif
+
+#endif
+
+
+namespace mongo {
+
+#if !defined(_WIN32) && !defined(__sunos__)
+
+ vector<string> getMyAddrs() {
+ vector<string> out;
+ ifaddrs * addrs;
+
+ if ( ! cmdLine.bind_ip.empty() ) {
+ boost::split( out, cmdLine.bind_ip, boost::is_any_of( ", " ) );
+ return out;
+ }
+
+ int status = getifaddrs(&addrs);
+ massert(13469, "getifaddrs failure: " + errnoWithDescription(errno), status == 0);
+
+ // based on example code from linux getifaddrs manpage
+ for (ifaddrs * addr = addrs; addr != NULL; addr = addr->ifa_next) {
+ if ( addr->ifa_addr == NULL ) continue;
+ int family = addr->ifa_addr->sa_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ status = getnameinfo(addr->ifa_addr,
+ (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)),
+ host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+ if ( status != 0 ) {
+ freeifaddrs( addrs );
+ addrs = NULL;
+ msgasserted( 13470, string("getnameinfo() failed: ") + gai_strerror(status) );
+ }
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeifaddrs( addrs );
+ addrs = NULL;
+
+ if (logLevel >= 1) {
+ log(1) << "getMyAddrs():";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+
+ vector<string> getAllIPs(StringData iporhost) {
+ addrinfo* addrs = NULL;
+ addrinfo hints;
+ memset(&hints, 0, sizeof(addrinfo));
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_family = (IPv6Enabled() ? AF_UNSPEC : AF_INET);
+
+ static string portNum = BSONObjBuilder::numStr(cmdLine.port);
+
+ vector<string> out;
+
+ int ret = getaddrinfo(iporhost.data(), portNum.c_str(), &hints, &addrs);
+ if ( ret ) {
+ warning() << "getaddrinfo(\"" << iporhost.data() << "\") failed: " << gai_strerror(ret) << endl;
+ return out;
+ }
+
+ for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) {
+ int family = addr->ai_family;
+ char host[NI_MAXHOST];
+
+ if (family == AF_INET || family == AF_INET6) {
+ int status = getnameinfo(addr->ai_addr, addr->ai_addrlen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
+
+ massert(13472, string("getnameinfo() failed: ") + gai_strerror(status), status == 0);
+
+ out.push_back(host);
+ }
+
+ }
+
+ freeaddrinfo(addrs);
+
+ if (logLevel >= 1) {
+            log(1) << "getAllIPs(\"" << iporhost << "\"):";
+ for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) {
+ log(1) << " [" << *it << ']';
+ }
+ log(1) << endl;
+ }
+
+ return out;
+ }
+#endif
+
+
+ class IsSelfCommand : public Command {
+ public:
+ IsSelfCommand() : Command("_isSelf") , _cacheLock( "IsSelfCommand::_cacheLock" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ _isSelf : 1 } INTERNAL ONLY";
+ }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ init();
+ result.append( "id" , _id );
+ return true;
+ }
+
+ void init() {
+ scoped_lock lk( _cacheLock );
+ if ( ! _id.isSet() )
+ _id.init();
+ }
+
+ OID _id;
+
+ mongo::mutex _cacheLock;
+ map<string,bool> _cache;
+ } isSelfCommand;
+
+ bool HostAndPort::isSelf() const {
+
+ if( dyn() ) {
+ LOG(2) << "isSelf " << _dynName << ' ' << dynHostMyName() << endl;
+ return dynHostMyName() == _dynName;
+ }
+
+ int _p = port();
+ int p = _p == -1 ? CmdLine::DefaultDBPort : _p;
+
+ if( p != cmdLine.port ) {
+ // shortcut - ports have to match at the very least
+ return false;
+ }
+
+ string host = str::stream() << this->host() << ":" << p;
+
+ {
+ // check cache for this host
+ // debatably something _could_ change, but I'm not sure right now (erh 10/14/2010)
+ scoped_lock lk( isSelfCommand._cacheLock );
+ map<string,bool>::const_iterator i = isSelfCommand._cache.find( host );
+ if ( i != isSelfCommand._cache.end() )
+ return i->second;
+ }
+
+#if !defined(_WIN32) && !defined(__sunos__)
+ // on linux and os x we can do a quick check for an ip match
+
+ const vector<string> myaddrs = getMyAddrs();
+ const vector<string> addrs = getAllIPs(_host);
+
+ for (vector<string>::const_iterator i=myaddrs.begin(), iend=myaddrs.end(); i!=iend; ++i) {
+ for (vector<string>::const_iterator j=addrs.begin(), jend=addrs.end(); j!=jend; ++j) {
+ string a = *i;
+ string b = *j;
+
+ if ( a == b ||
+ ( str::startsWith( a , "127." ) && str::startsWith( b , "127." ) ) // 127. is all loopback
+ ) {
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = true;
+ return true;
+ }
+ }
+ }
+
+#endif
+
+ if ( ! Listener::getTimeTracker() ) {
+ // this ensures we are actually running a server
+ // this may return true later, so may want to retry
+ return false;
+ }
+
+ try {
+ isSelfCommand.init();
+ DBClientConnection conn;
+ string errmsg;
+ if ( ! conn.connect( host , errmsg ) ) {
+ // should this go in the cache?
+ return false;
+ }
+
+ if (!noauth && cmdLine.keyFile &&
+ !conn.auth("local", internalSecurity.user, internalSecurity.pwd, errmsg, false)) {
+ return false;
+ }
+
+ BSONObj out;
+ bool ok = conn.simpleCommand( "admin" , &out , "_isSelf" );
+ bool me = ok && out["id"].type() == jstOID && isSelfCommand._id == out["id"].OID();
+
+ // add to cache
+ scoped_lock lk( isSelfCommand._cacheLock );
+ isSelfCommand._cache[host] = me;
+
+ return me;
+ }
+ catch ( std::exception& e ) {
+            warning() << "couldn't check isSelf (" << host << ") " << e.what() << endl;
+ }
+
+ return false;
+ }
+
+}
diff --git a/src/mongo/db/commands/mr.cpp b/src/mongo/db/commands/mr.cpp
new file mode 100644
index 00000000000..add76c39c47
--- /dev/null
+++ b/src/mongo/db/commands/mr.cpp
@@ -0,0 +1,1317 @@
+// mr.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../commands.h"
+#include "../../scripting/engine.h"
+#include "../../client/dbclient.h"
+#include "../../client/connpool.h"
+#include "../../client/parallel.h"
+#include "../queryoptimizer.h"
+#include "../matcher.h"
+#include "../clientcursor.h"
+#include "../replutil.h"
+#include "../../s/d_chunk_manager.h"
+#include "../../s/d_logic.h"
+#include "../../s/grid.h"
+
+#include "mr.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ AtomicUInt Config::JOB_NUMBER;
+
+ JSFunction::JSFunction( string type , const BSONElement& e ) {
+ _type = type;
+ _code = e._asCode();
+
+ if ( e.type() == CodeWScope )
+ _wantedScope = e.codeWScopeObject();
+ }
+
+ void JSFunction::init( State * state ) {
+ _scope = state->scope();
+ assert( _scope );
+ _scope->init( &_wantedScope );
+
+ _func = _scope->createFunction( _code.c_str() );
+ uassert( 13598 , str::stream() << "couldn't compile code for: " << _type , _func );
+
+ // install in JS scope so that it can be called in JS mode
+ _scope->setFunction(_type.c_str(), _code.c_str());
+ }
+
+ void JSMapper::init( State * state ) {
+ _func.init( state );
+ _params = state->config().mapParams;
+ }
+
+ /**
+ * Applies the map function to an object, which should internally call emit()
+ */
+ void JSMapper::map( const BSONObj& o ) {
+ Scope * s = _func.scope();
+ assert( s );
+ if ( s->invoke( _func.func() , &_params, &o , 0 , true, false, true ) )
+ throw UserException( 9014, str::stream() << "map invoke failed: " + s->getError() );
+ }
+
+ /**
+ * Applies the finalize function to a tuple obj (key, val)
+ * Returns tuple obj {_id: key, value: newval}
+ */
+ BSONObj JSFinalizer::finalize( const BSONObj& o ) {
+ Scope * s = _func.scope();
+
+ Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" );
+ s->invokeSafe( _func.func() , &o, 0 );
+
+ // don't want to use o.objsize() to size b
+ // since there are many cases where the point of finalize
+ // is converting many fields to 1
+ BSONObjBuilder b;
+ b.append( o.firstElement() );
+ s->append( b , "value" , "return" );
+ return b.obj();
+ }
+
+ void JSReducer::init( State * state ) {
+ _func.init( state );
+ }
+
+ /**
+ * Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value}
+ */
+ BSONObj JSReducer::reduce( const BSONList& tuples ) {
+ if (tuples.size() <= 1)
+ return tuples[0];
+ BSONObj key;
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "0" );
+ _func.scope()->append( b , "1" , "return" );
+ return b.obj();
+ }
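+
+            // the user-supplied JS reduce must fold (key, [values]) into a single value,
+            // e.g. (sketch): function(key, values) { var t = 0;
+            //                    values.forEach(function(v){ t += v; }); return t; }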
+
+ /**
+         * Reduces a list of tuple objects (key, value) to a single tuple {_id: key, value: val}
+ * Also applies a finalizer method if present.
+ */
+ BSONObj JSReducer::finalReduce( const BSONList& tuples , Finalizer * finalizer ) {
+
+ BSONObj res;
+ BSONObj key;
+
+ if (tuples.size() == 1) {
+ // 1 obj, just use it
+ key = tuples[0];
+ BSONObjBuilder b(key.objsize());
+ BSONObjIterator it(key);
+ b.appendAs( it.next() , "_id" );
+ b.appendAs( it.next() , "value" );
+ res = b.obj();
+ }
+ else {
+ // need to reduce
+ int endSizeEstimate = 16;
+ _reduce( tuples , key , endSizeEstimate );
+ BSONObjBuilder b(endSizeEstimate);
+ b.appendAs( key.firstElement() , "_id" );
+ _func.scope()->append( b , "value" , "return" );
+ res = b.obj();
+ }
+
+ if ( finalizer ) {
+ res = finalizer->finalize( res );
+ }
+
+ return res;
+ }
+
+ /**
+         * Actually applies a reduce to a list of tuples (key, value).
+ * After the call, tuples will hold a single tuple {"0": key, "1": value}
+ */
+ void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) {
+ uassert( 10074 , "need values" , tuples.size() );
+
+ int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128;
+
+ // need to build the reduce args: ( key, [values] )
+ BSONObjBuilder reduceArgs( sizeEstimate );
+ boost::scoped_ptr<BSONArrayBuilder> valueBuilder;
+ int sizeSoFar = 0;
+ unsigned n = 0;
+ for ( ; n<tuples.size(); n++ ) {
+ BSONObjIterator j(tuples[n]);
+ BSONElement keyE = j.next();
+ if ( n == 0 ) {
+ reduceArgs.append( keyE );
+ key = keyE.wrap();
+ sizeSoFar = 5 + keyE.size();
+ valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) ));
+ }
+
+ BSONElement ee = j.next();
+
+ uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) );
+
+ if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) {
+ assert( n > 1 ); // if not, inf. loop
+ break;
+ }
+
+ valueBuilder->append( ee );
+ sizeSoFar += ee.size();
+ }
+ assert(valueBuilder);
+ valueBuilder->done();
+ BSONObj args = reduceArgs.obj();
+
+ Scope * s = _func.scope();
+
+ s->invokeSafe( _func.func() , &args, 0, 0, false, true, true );
+ ++numReduces;
+
+ if ( s->type( "return" ) == Array ) {
+ uasserted( 10075 , "reduce -> multiple not supported yet");
+ return;
+ }
+
+ endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() );
+
+ if ( n == tuples.size() )
+ return;
+
+            // the input list was too large; add the remaining elements to new tuples and reduce again
+            // note: a loop would be better than recursion here, to avoid stack overflow
+ BSONList x;
+ for ( ; n < tuples.size(); n++ ) {
+ x.push_back( tuples[n] );
+ }
+ BSONObjBuilder temp( endSizeEstimate );
+ temp.append( key.firstElement() );
+ s->append( temp , "1" , "return" );
+ x.push_back( temp.obj() );
+ _reduce( x , key , endSizeEstimate );
+ }
+
+ Config::Config( const string& _dbname , const BSONObj& cmdObj ) {
+
+ dbname = _dbname;
+ ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ verbose = cmdObj["verbose"].trueValue();
+ jsMode = cmdObj["jsMode"].trueValue();
+ splitInfo = 0;
+ if (cmdObj.hasField("splitInfo"))
+ splitInfo = cmdObj["splitInfo"].Int();
+
+ jsMaxKeys = 500000;
+ reduceTriggerRatio = 10.0;
+ maxInMemSize = 500 * 1024;
+
+ uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() );
+
+ if ( cmdObj["out"].type() == String ) {
+ finalShort = cmdObj["out"].String();
+ outType = REPLACE;
+ }
+ else if ( cmdObj["out"].type() == Object ) {
+ BSONObj o = cmdObj["out"].embeddedObject();
+
+ BSONElement e = o.firstElement();
+ string t = e.fieldName();
+
+ if ( t == "normal" || t == "replace" ) {
+ outType = REPLACE;
+ finalShort = e.String();
+ }
+ else if ( t == "merge" ) {
+ outType = MERGE;
+ finalShort = e.String();
+ }
+ else if ( t == "reduce" ) {
+ outType = REDUCE;
+ finalShort = e.String();
+ }
+ else if ( t == "inline" ) {
+ outType = INMEMORY;
+ }
+ else {
+ uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" );
+ }
+
+ if (o.hasElement("db")) {
+ outDB = o["db"].String();
+ }
+
+ if (o.hasElement("nonAtomic")) {
+ outNonAtomic = o["nonAtomic"].Bool();
+ if (outNonAtomic)
+ uassert( 15895 , "nonAtomic option cannot be used with this output type", (outType == REDUCE || outType == MERGE) );
+ }
+ }
+ else {
+ uasserted( 13606 , "'out' has to be a string or an object" );
+ }
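+
+            // accepted 'out' specifiers, per the parsing above (sketch):
+            //     out : "coll"                                 -> REPLACE into <dbname>.coll
+            //     out : { merge : "coll" , db : "otherdb" }    -> MERGE into otherdb.coll
+            //     out : { reduce : "coll" , nonAtomic : true } -> REDUCE, re-reducing into coll
+            //     out : { inline : 1 }                         -> INMEMORY, results in the reply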
+
+ if ( outType != INMEMORY ) { // setup names
+ tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << JOB_NUMBER++;
+
+ incLong = tempLong + "_inc";
+
+ finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort;
+ }
+
+ {
+ // scope and code
+
+ if ( cmdObj["scope"].type() == Object )
+ scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();
+
+ mapper.reset( new JSMapper( cmdObj["map"] ) );
+ reducer.reset( new JSReducer( cmdObj["reduce"] ) );
+ if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() )
+ finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) );
+
+ if ( cmdObj["mapparams"].type() == Array ) {
+ mapParams = cmdObj["mapparams"].embeddedObjectUserCheck();
+ }
+
+ }
+
+ {
+ // query options
+ BSONElement q = cmdObj["query"];
+ if ( q.type() == Object )
+ filter = q.embeddedObjectUserCheck();
+ else
+ uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() );
+
+
+ BSONElement s = cmdObj["sort"];
+ if ( s.type() == Object )
+ sort = s.embeddedObjectUserCheck();
+ else
+ uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() );
+
+ if ( cmdObj["limit"].isNumber() )
+ limit = cmdObj["limit"].numberLong();
+ else
+ limit = 0;
+ }
+ }
+
+ /**
+ * Create temporary collection, set up indexes
+ */
+ void State::prepTempCollection() {
+ if ( ! _onDisk )
+ return;
+
+ if (_config.incLong != _config.tempLong) {
+ // create the inc collection and make sure we have index on "0" key
+ _db.dropCollection( _config.incLong );
+ {
+ writelock l( _config.incLong );
+ Client::Context ctx( _config.incLong );
+ string err;
+ if ( ! userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) {
+ uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err );
+ }
+ }
+
+ BSONObj sortKey = BSON( "0" << 1 );
+ _db.ensureIndex( _config.incLong , sortKey );
+ }
+
+ // create temp collection
+ _db.dropCollection( _config.tempLong );
+ {
+ writelock lock( _config.tempLong.c_str() );
+ Client::Context ctx( _config.tempLong.c_str() );
+ string errmsg;
+ if ( ! userCreateNS( _config.tempLong.c_str() , BSONObj() , errmsg , true ) ) {
+ uasserted( 13630 , str::stream() << "userCreateNS failed for mr tempLong ns: " << _config.tempLong << " err: " << errmsg );
+ }
+ }
+
+ {
+ // copy indexes
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.finalLong );
+ while ( idx->more() ) {
+ BSONObj i = idx->next();
+
+ BSONObjBuilder b( i.objsize() + 16 );
+ b.append( "ns" , _config.tempLong );
+ BSONObjIterator j( i );
+ while ( j.more() ) {
+ BSONElement e = j.next();
+ if ( str::equals( e.fieldName() , "_id" ) ||
+ str::equals( e.fieldName() , "ns" ) )
+ continue;
+
+ b.append( e );
+ }
+
+ BSONObj indexToInsert = b.obj();
+ insert( Namespace( _config.tempLong.c_str() ).getSisterNS( "system.indexes" ).c_str() , indexToInsert );
+ }
+
+ }
+
+ }
+
+ /**
+ * For inline mode, appends results to output object.
+ * Makes sure (key, value) tuple is formatted as {_id: key, value: val}
+ */
+ void State::appendResults( BSONObjBuilder& final ) {
+ if ( _onDisk ) {
+ if (!_config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !_config.outDB.empty())
+ loc.append( "db" , _config.outDB );
+ if ( !_config.finalShort.empty() )
+ loc.append( "collection" , _config.finalShort );
+ final.append("result", loc.obj());
+ }
+ else {
+ if ( !_config.finalShort.empty() )
+ final.append( "result" , _config.finalShort );
+ }
+
+ if ( _config.splitInfo > 0 ) {
+                // add split points, used for sharding
+ BSONObj res;
+ BSONObj idKey = BSON( "_id" << 1 );
+ if ( ! _db.runCommand( "admin" , BSON( "splitVector" << _config.finalLong << "keyPattern" << idKey << "maxChunkSizeBytes" << _config.splitInfo ) , res ) ) {
+ uasserted( 15921 , str::stream() << "splitVector failed: " << res );
+ }
+ if ( res.hasField( "splitKeys" ) )
+ final.append( res.getField( "splitKeys" ) );
+ }
+ return;
+ }
+
+ if (_jsMode) {
+ ScriptingFunction getResult = _scope->createFunction("var map = _mrMap; var result = []; for (key in map) { result.push({_id: key, value: map[key]}) } return result;");
+ _scope->invoke(getResult, 0, 0, 0, false);
+ BSONObj obj = _scope->getObject("return");
+ final.append("results", BSONArray(obj));
+ return;
+ }
+
+ uassert( 13604 , "too much data for in memory map/reduce" , _size < BSONObjMaxUserSize );
+
+ BSONArrayBuilder b( (int)(_size * 1.2) ); // _size is data size, doesn't count overhead and keys
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObjIterator vi( all[0] );
+ vi.next();
+
+ BSONObjBuilder temp( b.subobjStart() );
+ temp.appendAs( key.firstElement() , "_id" );
+ temp.appendAs( vi.next() , "value" );
+ temp.done();
+ }
+
+ BSONArray res = b.arr();
+ final.append( "results" , res );
+ }
+
+ /**
+ * Does post processing on output collection.
+ * This may involve replacing, merging or reducing.
+ */
+ long long State::postProcessCollection(CurOp* op, ProgressMeterHolder& pm) {
+ if ( _onDisk == false || _config.outType == Config::INMEMORY )
+ return _temp->size();
+
+ if (_config.outNonAtomic)
+ return postProcessCollectionNonAtomic(op, pm);
+ writelock lock;
+ return postProcessCollectionNonAtomic(op, pm);
+ }
+
+ long long State::postProcessCollectionNonAtomic(CurOp* op, ProgressMeterHolder& pm) {
+
+ if ( _config.finalLong == _config.tempLong )
+ return _db.count( _config.finalLong );
+
+ if ( _config.outType == Config::REPLACE || _db.count( _config.finalLong ) == 0 ) {
+ writelock lock;
+ // replace: just rename from temp to final collection name, dropping previous collection
+ _db.dropCollection( _config.finalLong );
+ BSONObj info;
+ if ( ! _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) ) {
+ uasserted( 10076 , str::stream() << "rename failed: " << info );
+ }
+
+ _db.dropCollection( _config.tempLong );
+ }
+ else if ( _config.outType == Config::MERGE ) {
+ // merge: upsert new docs into old collection
+ op->setMessage( "m/r: merge post processing" , _db.count( _config.tempLong, BSONObj() ) );
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ writelock lock;
+ BSONObj o = cursor->next();
+ Helpers::upsert( _config.finalLong , o );
+ getDur().commitIfNeeded();
+ pm.hit();
+ }
+ _db.dropCollection( _config.tempLong );
+ pm.finished();
+ }
+ else if ( _config.outType == Config::REDUCE ) {
+ // reduce: apply reduce op on new result and existing one
+ BSONList values;
+
+ op->setMessage( "m/r: reduce post processing" , _db.count( _config.tempLong, BSONObj() ) );
+ auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() );
+ while ( cursor->more() ) {
+ writelock lock;
+ BSONObj temp = cursor->next();
+ BSONObj old;
+
+ bool found;
+ {
+ Client::Context tx( _config.finalLong );
+ found = Helpers::findOne( _config.finalLong.c_str() , temp["_id"].wrap() , old , true );
+ }
+
+ if ( found ) {
+ // need to reduce
+ values.clear();
+ values.push_back( temp );
+ values.push_back( old );
+ Helpers::upsert( _config.finalLong , _config.reducer->finalReduce( values , _config.finalizer.get() ) );
+ }
+ else {
+ Helpers::upsert( _config.finalLong , temp );
+ }
+ getDur().commitIfNeeded();
+ pm.hit();
+ }
+ _db.dropCollection( _config.tempLong );
+ pm.finished();
+ }
+
+ return _db.count( _config.finalLong );
+ }
+
+ /**
+ * Insert doc in collection
+ */
+ void State::insert( const string& ns , const BSONObj& o ) {
+ assert( _onDisk );
+
+ writelock l( ns );
+ Client::Context ctx( ns );
+
+ theDataFileMgr.insertAndLog( ns.c_str() , o , false );
+ }
+
+ /**
+ * Insert doc into the inc collection, taking proper lock
+ */
+ void State::insertToInc( BSONObj& o ) {
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+ _insertToInc(o);
+ }
+
+ /**
+ * Insert doc into the inc collection
+ */
+ void State::_insertToInc( BSONObj& o ) {
+ assert( _onDisk );
+ theDataFileMgr.insertWithObjMod( _config.incLong.c_str() , o , true );
+ getDur().commitIfNeeded();
+ }
+
+ State::State( const Config& c ) : _config( c ), _size(0), _dupCount(0), _numEmits(0) {
+ _temp.reset( new InMemory() );
+ _onDisk = _config.outType != Config::INMEMORY;
+ }
+
+ bool State::sourceExists() {
+ return _db.exists( _config.ns );
+ }
+
+ long long State::incomingDocuments() {
+ return _db.count( _config.ns , _config.filter , QueryOption_SlaveOk , (unsigned) _config.limit );
+ }
+
+ State::~State() {
+ if ( _onDisk ) {
+ try {
+ _db.dropCollection( _config.tempLong );
+ _db.dropCollection( _config.incLong );
+ }
+ catch ( std::exception& e ) {
+ error() << "couldn't cleanup after map reduce: " << e.what() << endl;
+ }
+ }
+
+ if (_scope) {
+ // cleanup js objects
+ ScriptingFunction cleanup = _scope->createFunction("delete _emitCt; delete _keyCt; delete _mrMap;");
+ _scope->invoke(cleanup, 0, 0, 0, true);
+ }
+ }
+
+ /**
+ * Initialize the mapreduce operation, creating the inc collection
+ */
+ void State::init() {
+ // setup js
+ _scope.reset(globalScriptEngine->getPooledScope( _config.dbname ).release() );
+ _scope->localConnect( _config.dbname.c_str() );
+
+ if ( ! _config.scopeSetup.isEmpty() )
+ _scope->init( &_config.scopeSetup );
+
+ _config.mapper->init( this );
+ _config.reducer->init( this );
+ if ( _config.finalizer )
+ _config.finalizer->init( this );
+ _scope->setBoolean("_doFinal", _config.finalizer);
+
+ // by default start in JS mode, will be faster for small jobs
+ _jsMode = _config.jsMode;
+// _jsMode = true;
+ switchMode(_jsMode);
+
+ // global JS map/reduce hashmap
+ // we use a standard JS object which means keys are only simple types
+            // we could also use a real hashmap from a library, though we would need to add object comparison methods
+// _scope->setObject("_mrMap", BSONObj(), false);
+ ScriptingFunction init = _scope->createFunction("_emitCt = 0; _keyCt = 0; _dupCt = 0; _redCt = 0; if (typeof(_mrMap) === 'undefined') { _mrMap = {}; }");
+ _scope->invoke(init, 0, 0, 0, true);
+
+ // js function to run reduce on all keys
+// redfunc = _scope->createFunction("for (var key in hashmap) { print('Key is ' + key); list = hashmap[key]; ret = reduce(key, list); print('Value is ' + ret); };");
+ _reduceAll = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length != 1) { ret = _reduce(key, list); map[key] = [ret]; ++_redCt; } } _dupCt = 0;");
+ _reduceAndEmit = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; } emit(key, ret); }; delete _mrMap;");
+ _reduceAndFinalize = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { if (!_doFinal) {continue;} ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } map[key] = ret; }");
+ _reduceAndFinalizeAndInsert = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } _nativeToTemp({_id: key, value: ret}); }");
+
+ }
+
+ void State::switchMode(bool jsMode) {
+ _jsMode = jsMode;
+ if (jsMode) {
+ // emit function that stays in JS
+ _scope->setFunction("emit", "function(key, value) { if (typeof(key) === 'object') { _bailFromJS(key, value); return; }; ++_emitCt; var map = _mrMap; var list = map[key]; if (!list) { ++_keyCt; list = []; map[key] = list; } else { ++_dupCt; } list.push(value); }");
+ _scope->injectNative("_bailFromJS", _bailFromJS, this);
+ }
+ else {
+ // emit now populates C++ map
+ _scope->injectNative( "emit" , fast_emit, this );
+ }
+ }
+
+ void State::bailFromJS() {
+ log(1) << "M/R: Switching from JS mode to mixed mode" << endl;
+
+ // reduce and reemit into c++
+ switchMode(false);
+ _scope->invoke(_reduceAndEmit, 0, 0, 0, true);
+ // need to get the real number emitted so far
+ _numEmits = _scope->getNumberInt("_emitCt");
+ _config.reducer->numReduces = _scope->getNumberInt("_redCt");
+ }
+
+ /**
+ * Applies last reduce and finalize on a list of tuples (key, val)
+ * Inserts single result {_id: key, value: val} into temp collection
+ */
+ void State::finalReduce( BSONList& values ) {
+ if ( !_onDisk || values.size() == 0 )
+ return;
+
+ BSONObj res = _config.reducer->finalReduce( values , _config.finalizer.get() );
+ insert( _config.tempLong , res );
+ }
+
+ BSONObj _nativeToTemp( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ BSONObjIterator it(args);
+ state->insert(state->_config.tempLong, it.next().Obj());
+ return BSONObj();
+ }
+
+// BSONObj _nativeToInc( const BSONObj& args, void* data ) {
+// State* state = (State*) data;
+// BSONObjIterator it(args);
+// const BSONObj& obj = it.next().Obj();
+// state->_insertToInc(const_cast<BSONObj&>(obj));
+// return BSONObj();
+// }
+
+ /**
+ * Applies last reduce and finalize.
+ * After calling this method, the temp collection will be completed.
+ * If inline, the results will be in the in memory map
+ */
+ void State::finalReduce( CurOp * op , ProgressMeterHolder& pm ) {
+
+ if (_jsMode) {
+ // apply the reduce within JS
+ if (_onDisk) {
+ _scope->injectNative("_nativeToTemp", _nativeToTemp, this);
+ _scope->invoke(_reduceAndFinalizeAndInsert, 0, 0, 0, true);
+ return;
+ }
+ else {
+ _scope->invoke(_reduceAndFinalize, 0, 0, 0, true);
+ return;
+ }
+ }
+
+ if ( ! _onDisk ) {
+ // all data has already been reduced, just finalize
+ if ( _config.finalizer ) {
+ long size = 0;
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ assert( all.size() == 1 );
+
+ BSONObj res = _config.finalizer->finalize( all[0] );
+
+ all.clear();
+ all.push_back( res );
+ size += res.objsize();
+ }
+ _size = size;
+ }
+ return;
+ }
+
+ // use index on "0" to pull sorted data
+ assert( _temp->size() == 0 );
+ BSONObj sortKey = BSON( "0" << 1 );
+ {
+ bool foundIndex = false;
+
+ auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.incLong );
+ while ( idx.get() && idx->more() ) {
+ BSONObj x = idx->next();
+ if ( sortKey.woCompare( x["key"].embeddedObject() ) == 0 ) {
+ foundIndex = true;
+ break;
+ }
+ }
+
+ assert( foundIndex );
+ }
+
+ readlock rl( _config.incLong.c_str() );
+ Client::Context ctx( _config.incLong );
+
+ BSONObj prev;
+ BSONList all;
+
+ assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , _db.count( _config.incLong, BSONObj(), QueryOption_SlaveOk ) ) );
+
+ shared_ptr<Cursor> temp = bestGuessCursor( _config.incLong.c_str() , BSONObj() , sortKey );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , _config.incLong.c_str() ) );
+
+ // iterate over all sorted objects
+ while ( cursor->ok() ) {
+ BSONObj o = cursor->current().getOwned();
+ cursor->advance();
+
+ pm.hit();
+
+ if ( o.woSortOrder( prev , sortKey ) == 0 ) {
+ // object is same as previous, add to array
+ all.push_back( o );
+ if ( pm->hits() % 1000 == 0 ) {
+ if ( ! cursor->yield() ) {
+ cursor.release();
+ break;
+ }
+ killCurrentOp.checkForInterrupt();
+ }
+ continue;
+ }
+
+ ClientCursor::YieldLock yield (cursor.get());
+
+ try {
+                    // reduce and finalize the accumulated array
+ finalReduce( all );
+ }
+ catch (...) {
+ yield.relock();
+ cursor.release();
+ throw;
+ }
+
+ all.clear();
+ prev = o;
+ all.push_back( o );
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+
+ // we need to release here since we temp release below
+ cursor.release();
+
+ {
+ dbtempreleasecond tl;
+ if ( ! tl.unlocked() )
+ log( LL_WARNING ) << "map/reduce can't temp release" << endl;
+ // reduce and finalize last array
+ finalReduce( all );
+ }
+
+ pm.finished();
+ }
+
+ /**
+ * Attempts to reduce objects in the memory map.
+ * A new memory map will be created to hold the results.
+         * If applicable, objects with a unique key may be dumped to the inc collection.
+ * Input and output objects are both {"0": key, "1": val}
+ */
+ void State::reduceInMemory() {
+
+ if (_jsMode) {
+ // in js mode the reduce is applied when writing to collection
+ return;
+ }
+
+ auto_ptr<InMemory> n( new InMemory() ); // for new data
+ long nSize = 0;
+ _dupCount = 0;
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) {
+ BSONObj key = i->first;
+ BSONList& all = i->second;
+
+ if ( all.size() == 1 ) {
+ // only 1 value for this key
+ if ( _onDisk ) {
+ // this key has low cardinality, so just write to collection
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong.c_str());
+ _insertToInc( *(all.begin()) );
+ }
+ else {
+ // add to new map
+ _add( n.get() , all[0] , nSize );
+ }
+ }
+ else if ( all.size() > 1 ) {
+ // several values, reduce and add to map
+ BSONObj res = _config.reducer->reduce( all );
+ _add( n.get() , res , nSize );
+ }
+ }
+
+ // swap maps
+ _temp.reset( n.release() );
+ _size = nSize;
+ }
+
+ /**
+ * Dumps the entire in memory map to the inc collection.
+ */
+ void State::dumpToInc() {
+ if ( ! _onDisk )
+ return;
+
+ writelock l(_config.incLong);
+ Client::Context ctx(_config.incLong);
+
+ for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ) {
+ BSONList& all = i->second;
+ if ( all.size() < 1 )
+ continue;
+
+ for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ )
+ _insertToInc( *j );
+ }
+ _temp->clear();
+ _size = 0;
+
+ }
+
+ /**
+ * Adds object to in memory map
+ */
+ void State::emit( const BSONObj& a ) {
+ _numEmits++;
+ _add( _temp.get() , a , _size );
+ }
+
+ void State::_add( InMemory* im, const BSONObj& a , long& size ) {
+ BSONList& all = (*im)[a];
+ all.push_back( a );
+ size += a.objsize() + 16;
+ if (all.size() > 1)
+ ++_dupCount;
+ }
+
+ /**
+         * this method checks the size of the in-memory map and potentially flushes it to disk
+ */
+ void State::checkSize() {
+ if (_jsMode) {
+ // try to reduce if it is beneficial
+ int dupCt = _scope->getNumberInt("_dupCt");
+ int keyCt = _scope->getNumberInt("_keyCt");
+
+ if (keyCt > _config.jsMaxKeys) {
+ // too many keys for JS, switch to mixed
+ _bailFromJS(BSONObj(), this);
+ // then fall through to check map size
+ }
+ else if (dupCt > (keyCt * _config.reduceTriggerRatio)) {
+ // reduce now to lower mem usage
+ Timer t;
+ _scope->invoke(_reduceAll, 0, 0, 0, true);
+ log(1) << " MR - did reduceAll: keys=" << keyCt << " dups=" << dupCt << " newKeys=" << _scope->getNumberInt("_keyCt") << " time=" << t.millis() << "ms" << endl;
+ return;
+ }
+ }
+
+ if (_jsMode)
+ return;
+
+ if (_size > _config.maxInMemSize || _dupCount > (_temp->size() * _config.reduceTriggerRatio)) {
+ // attempt to reduce in memory map, if memory is too high or we have many duplicates
+ long oldSize = _size;
+ Timer t;
+ reduceInMemory();
+ log(1) << " MR - did reduceInMemory: size=" << oldSize << " dups=" << _dupCount << " newSize=" << _size << " time=" << t.millis() << "ms" << endl;
+
+ // if size is still high, or values are not reducing well, dump
+ if ( _onDisk && (_size > _config.maxInMemSize || _size > oldSize / 2) ) {
+ dumpToInc();
+ log(1) << " MR - dumping to db" << endl;
+ }
+ }
+ }
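+
+        // e.g. with the Config defaults above (maxInMemSize = 500KB, reduceTriggerRatio = 10),
+        // a non-jsMode job reduces in memory once the map tops 500KB or duplicates outnumber
+        // keys 10:1, and dumps to the inc collection if that reduce fails to roughly halve the map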
+
+ /**
+ * emit that will be called by js function
+ */
+ BSONObj fast_emit( const BSONObj& args, void* data ) {
+ uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 );
+ uassert( 13069 , "an emit can't be more than half max bson size" , args.objsize() < ( BSONObjMaxUserSize / 2 ) );
+
+ State* state = (State*) data;
+ if ( args.firstElement().type() == Undefined ) {
+ BSONObjBuilder b( args.objsize() );
+ b.appendNull( "" );
+ BSONObjIterator i( args );
+ i.next();
+ b.append( i.next() );
+ state->emit( b.obj() );
+ }
+ else {
+ state->emit( args );
+ }
+ return BSONObj();
+ }
+
+ /**
+         * called when we realize we can't use js mode for m/r on the 1st key
+ */
+ BSONObj _bailFromJS( const BSONObj& args, void* data ) {
+ State* state = (State*) data;
+ state->bailFromJS();
+
+ // emit this particular key if there is one
+ if (!args.isEmpty()) {
+ fast_emit(args, data);
+ }
+ return BSONObj();
+ }
+
+ /**
+ * This class represents a map/reduce command executed on a single server
+ */
+ class MapReduceCommand : public Command {
+ public:
+ MapReduceCommand() : Command("mapReduce", false, "mapreduce") {}
+
+ /* why !replset ?
+ bad things happen with --slave (i think because of this)
+ */
+ virtual bool slaveOk() const { return !replSet; }
+
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual void help( stringstream &help ) const {
+ help << "Run a map/reduce operation on the server.\n";
+ help << "Note this is used for aggregation, not querying, in MongoDB.\n";
+ help << "http://www.mongodb.org/display/DOCS/MapReduce";
+ }
+
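+
+        // example invocation (sketch; m and r are user-supplied JS functions):
+        //     db.runCommand( { mapreduce : "coll" , map : m , reduce : r , out : { inline : 1 } } )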
+ virtual LockType locktype() const { return NONE; }
+
+ bool run(const string& dbname , BSONObj& cmd, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer t;
+ Client& client = cc();
+ CurOp * op = client.curop();
+
+ Config config( dbname , cmd );
+
+ log(1) << "mr ns: " << config.ns << endl;
+
+ bool shouldHaveData = false;
+
+ long long num = 0;
+ long long inReduce = 0;
+
+ BSONObjBuilder countsBuilder;
+ BSONObjBuilder timingBuilder;
+ State state( config );
+ if ( ! state.sourceExists() ) {
+ errmsg = "ns doesn't exist";
+ return false;
+ }
+
+ if (replSet && state.isOnDisk()) {
+                // an on-disk output means this will be doing write operations, so make sure we are master
+ // ideally this check should be in slaveOk(), but at that point config is not known
+ if (!isMaster(dbname.c_str())) {
+ errmsg = "not master";
+ return false;
+ }
+ }
+
+ if (state.isOnDisk() && !client.getAuthenticationInfo()->isAuthorized(dbname)) {
+ errmsg = "read-only user cannot output mapReduce to collection, use inline instead";
+ return false;
+ }
+
+ try {
+ state.init();
+ state.prepTempCollection();
+ ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) );
+
+                wassert( config.limit < 0x4000000 ); // sanity: keep limit well below 32 bit unsigned range
+ long long mapTime = 0;
+ {
+ readlock lock( config.ns );
+ Client::Context ctx( config.ns );
+
+ ShardChunkManagerPtr chunkManager;
+ if ( shardingState.needShardChunkManager( config.ns ) ) {
+ chunkManager = shardingState.getShardChunkManager( config.ns );
+ }
+
+ // obtain cursor on data to apply mr to, sorted
+ shared_ptr<Cursor> temp = NamespaceDetailsTransient::getCursor( config.ns.c_str(), config.filter, config.sort );
+ uassert( 15876, str::stream() << "could not create cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, temp.get() );
+ auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) );
+ uassert( 15877, str::stream() << "could not create client cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, cursor.get() );
+
+ Timer mt;
+ // go through each doc
+ while ( cursor->ok() ) {
+ if ( ! cursor->currentMatches() ) {
+ cursor->advance();
+ continue;
+ }
+
+                    // make sure we don't process duplicates in case data gets moved around during map
+ // TODO This won't actually help when data gets moved, it's to handle multikeys.
+ if ( cursor->currentIsDup() ) {
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj o = cursor->current();
+ cursor->advance();
+
+ // check to see if this is a new object we don't own yet
+ // because of a chunk migration
+ if ( chunkManager && ! chunkManager->belongsToMe( o ) )
+ continue;
+
+ // do map
+ if ( config.verbose ) mt.reset();
+ config.mapper->map( o );
+ if ( config.verbose ) mapTime += mt.micros();
+
+ num++;
+ if ( num % 1000 == 0 ) {
+ // try to yield lock regularly
+ ClientCursor::YieldLock yield (cursor.get());
+ Timer t;
+ // check if map needs to be dumped to disk
+ state.checkSize();
+ inReduce += t.micros();
+
+ if ( ! yield.stillOk() ) {
+ cursor.release();
+ break;
+ }
+
+ killCurrentOp.checkForInterrupt();
+ }
+ pm.hit();
+
+ if ( config.limit && num >= config.limit )
+ break;
+ }
+ }
+ pm.finished();
+
+ killCurrentOp.checkForInterrupt();
+ // update counters
+ countsBuilder.appendNumber( "input" , num );
+ countsBuilder.appendNumber( "emit" , state.numEmits() );
+ if ( state.numEmits() )
+ shouldHaveData = true;
+
+ timingBuilder.append( "mapTime" , mapTime / 1000 );
+ timingBuilder.append( "emitLoop" , t.millis() );
+
+ op->setMessage( "m/r: (2/3) final reduce in memory" );
+                Timer rt; // reduce-phase timer; must not shadow the outer total timer t
+ // do reduce in memory
+ // this will be the last reduce needed for inline mode
+ state.reduceInMemory();
+ // if not inline: dump the in memory map to inc collection, all data is on disk
+ state.dumpToInc();
+ // final reduce
+ state.finalReduce( op , pm );
+                inReduce += rt.micros();
+ countsBuilder.appendNumber( "reduce" , state.numReduces() );
+ timingBuilder.append( "reduceTime" , inReduce / 1000 );
+ timingBuilder.append( "mode" , state.jsMode() ? "js" : "mixed" );
+
+ long long finalCount = state.postProcessCollection(op, pm);
+ state.appendResults( result );
+
+ timingBuilder.append( "total" , t.millis() );
+ result.append( "timeMillis" , t.millis() );
+ countsBuilder.appendNumber( "output" , finalCount );
+ if ( config.verbose ) result.append( "timing" , timingBuilder.obj() );
+ result.append( "counts" , countsBuilder.obj() );
+
+ if ( finalCount == 0 && shouldHaveData ) {
+ result.append( "cmd" , cmd );
+ errmsg = "there were emits but no data!";
+ return false;
+ }
+
+ }
+ catch( SendStaleConfigException& e ){
+ log() << "mr detected stale config, should retry" << causedBy(e) << endl;
+ throw e;
+ }
+ // TODO: The error handling code for queries is v. fragile,
+            // *requires* rethrowing AssertionExceptions - should probably fix.
+ catch ( AssertionException& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
+ }
+ catch ( std::exception& e ){
+ log() << "mr failed, removing collection" << causedBy(e) << endl;
+ throw e;
+ }
+ catch ( ... ) {
+ log() << "mr failed for unknown reason, removing collection" << endl;
+ throw;
+ }
+
+ return true;
+ }
+
+ } mapReduceCommand;
+
+ /**
+     * This class represents a map/reduce command executed on the output server of a sharded environment
+ */
+ class MapReduceFinishCommand : public Command {
+ public:
+ MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ) {}
+ virtual bool slaveOk() const { return !replSet; }
+ virtual bool slaveOverrideOk() { return true; }
+
+ virtual LockType locktype() const { return NONE; }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ ShardedConnectionInfo::addHook();
+ // legacy name
+ string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe();
+ string inputNS = cmdObj["inputNS"].valuestrsafe();
+ if (inputNS.empty())
+ inputNS = dbname + "." + shardedOutputCollection;
+
+ Client& client = cc();
+ CurOp * op = client.curop();
+
+ Config config( dbname , cmdObj.firstElement().embeddedObjectUserCheck() );
+ State state(config);
+ state.init();
+
+ // no need for incremental collection because records are already sorted
+ config.incLong = config.tempLong;
+
+ BSONObj shardCounts = cmdObj["shardCounts"].embeddedObjectUserCheck();
+ BSONObj counts = cmdObj["counts"].embeddedObjectUserCheck();
+
+ ProgressMeterHolder pm( op->setMessage( "m/r: merge sort and reduce" ) );
+ set<ServerAndQuery> servers;
+ vector< auto_ptr<DBClientCursor> > shardCursors;
+
+ {
+ // parse per shard results
+ BSONObjIterator i( shardCounts );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string shard = e.fieldName();
+// BSONObj res = e.embeddedObjectUserCheck();
+ servers.insert( shard );
+ }
+ }
+
+ state.prepTempCollection();
+
+ BSONList values;
+ if (!config.outDB.empty()) {
+ BSONObjBuilder loc;
+ if ( !config.outDB.empty())
+ loc.append( "db" , config.outDB );
+ if ( !config.finalShort.empty() )
+ loc.append( "collection" , config.finalShort );
+ result.append("result", loc.obj());
+ }
+ else {
+ if ( !config.finalShort.empty() )
+ result.append( "result" , config.finalShort );
+ }
+
+            // fetch results from the other shards one chunk at a time
+ // it would be better to do just one big $or query, but then the sorting would not be efficient
+ string shardName = shardingState.getShardName();
+ DBConfigPtr confOut = grid.getDBConfig( dbname , false );
+ vector<ChunkPtr> chunks;
+ if ( confOut->isSharded(config.finalLong) ) {
+ ChunkManagerPtr cm = confOut->getChunkManager( config.finalLong );
+ const ChunkMap& chunkMap = cm->getChunkMap();
+ for ( ChunkMap::const_iterator it = chunkMap.begin(); it != chunkMap.end(); ++it ) {
+ ChunkPtr chunk = it->second;
+ if (chunk->getShard().getName() == shardName) chunks.push_back(chunk);
+ }
+ }
+
+ long long inputCount = 0;
+ unsigned int index = 0;
+ BSONObj query;
+ BSONArrayBuilder chunkSizes;
+ while (true) {
+ ChunkPtr chunk;
+ if (chunks.size() > 0) {
+ chunk = chunks[index];
+ BSONObjBuilder b;
+ b.appendAs(chunk->getMin().firstElement(), "$gte");
+ b.appendAs(chunk->getMax().firstElement(), "$lt");
+ query = BSON("_id" << b.obj());
+// chunkSizes.append(min);
+ }
+
+ // reduce from each shard for a chunk
+ BSONObj sortKey = BSON( "_id" << 1 );
+ ParallelSortClusteredCursor cursor( servers , inputNS , Query( query ).sort( sortKey ) );
+ cursor.init();
+ int chunkSize = 0;
+
+ while ( cursor.more() || !values.empty() ) {
+ BSONObj t;
+ if (cursor.more()) {
+ t = cursor.next().getOwned();
+ ++inputCount;
+
+ if ( values.size() == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+
+ if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) {
+ values.push_back( t );
+ continue;
+ }
+ }
+
+ BSONObj res = config.reducer->finalReduce( values , config.finalizer.get());
+ chunkSize += res.objsize();
+ if (state.isOnDisk())
+ state.insertToInc(res);
+ else
+ state.emit(res);
+ values.clear();
+ if (!t.isEmpty())
+ values.push_back( t );
+ }
+
+ if (chunk) {
+ chunkSizes.append(chunk->getMin());
+ chunkSizes.append(chunkSize);
+ }
+ if (++index >= chunks.size())
+ break;
+ }
+
+ result.append( "chunkSizes" , chunkSizes.arr() );
+
+ long long outputCount = state.postProcessCollection(op, pm);
+ state.appendResults( result );
+
+ BSONObjBuilder countsB(32);
+ countsB.append("input", inputCount);
+ countsB.append("reduce", state.numReduces());
+ countsB.append("output", outputCount);
+ result.append( "counts" , countsB.obj() );
+
+            return true;
+ }
+ } mapReduceFinishCommand;
+
+ }
+
+}
+
diff --git a/src/mongo/db/commands/mr.h b/src/mongo/db/commands/mr.h
new file mode 100644
index 00000000000..592769d82da
--- /dev/null
+++ b/src/mongo/db/commands/mr.h
@@ -0,0 +1,319 @@
+// mr.h
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ namespace mr {
+
+ typedef vector<BSONObj> BSONList;
+
+ class State;
+
+ // ------------ function interfaces -----------
+
+ class Mapper : boost::noncopyable {
+ public:
+ virtual ~Mapper() {}
+ virtual void init( State * state ) = 0;
+
+ virtual void map( const BSONObj& o ) = 0;
+ };
+
+ class Finalizer : boost::noncopyable {
+ public:
+ virtual ~Finalizer() {}
+ virtual void init( State * state ) = 0;
+
+ /**
+ * this takes a tuple and returns a tuple
+ */
+ virtual BSONObj finalize( const BSONObj& tuple ) = 0;
+ };
+
+ class Reducer : boost::noncopyable {
+ public:
+ Reducer() : numReduces(0) {}
+ virtual ~Reducer() {}
+ virtual void init( State * state ) = 0;
+
+ virtual BSONObj reduce( const BSONList& tuples ) = 0;
+            /** this means it's a final reduce, even if there is no finalizer */
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0;
+
+ long long numReduces;
+ };
+
+ // ------------ js function implementations -----------
+
+ /**
+         * used as a holder for Scope and ScriptingFunction
+         * visitor-like pattern, as the Scope is obtained on first access
+ */
+ class JSFunction : boost::noncopyable {
+ public:
+ /**
+ * @param type (map|reduce|finalize)
+ */
+ JSFunction( string type , const BSONElement& e );
+ virtual ~JSFunction() {}
+
+ virtual void init( State * state );
+
+ Scope * scope() const { return _scope; }
+ ScriptingFunction func() const { return _func; }
+
+ private:
+ string _type;
+ string _code; // actual javascript code
+ BSONObj _wantedScope; // this is for CodeWScope
+
+ Scope * _scope; // this is not owned by us, and might be shared
+ ScriptingFunction _func;
+ };
+
+ class JSMapper : public Mapper {
+ public:
+ JSMapper( const BSONElement & code ) : _func( "_map" , code ) {}
+ virtual void map( const BSONObj& o );
+ virtual void init( State * state );
+
+ private:
+ JSFunction _func;
+ BSONObj _params;
+ };
+
+ class JSReducer : public Reducer {
+ public:
+ JSReducer( const BSONElement& code ) : _func( "_reduce" , code ) {}
+ virtual void init( State * state );
+
+ virtual BSONObj reduce( const BSONList& tuples );
+ virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer );
+
+ private:
+
+ /**
+             * the result is left in the scope's "return" field
+ * @param key OUT
+ * @param endSizeEstimate OUT
+ */
+ void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate );
+
+ JSFunction _func;
+ };
+
+ class JSFinalizer : public Finalizer {
+ public:
+ JSFinalizer( const BSONElement& code ) : _func( "_finalize" , code ) {}
+ virtual BSONObj finalize( const BSONObj& o );
+ virtual void init( State * state ) { _func.init( state ); }
+ private:
+ JSFunction _func;
+
+ };
+
+ // -----------------
+
+
+ class TupleKeyCmp {
+ public:
+ TupleKeyCmp() {}
+ bool operator()( const BSONObj &l, const BSONObj &r ) const {
+ return l.firstElement().woCompare( r.firstElement() ) < 0;
+ }
+ };
+
+ typedef map< BSONObj,BSONList,TupleKeyCmp > InMemory; // from key to list of tuples
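+
+        /* sketch of the shape of an InMemory map (hypothetical contents):
+             { "" : "a", "" : 1 }  ->  [ { "" : "a", "" : 1 }, { "" : "a", "" : 3 } ]
+           the map key is a representative tuple; TupleKeyCmp compares only the
+           first element, so every tuple sharing an emitted key lands in one list. */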
+
+ /**
+ * holds map/reduce config information
+ */
+ class Config {
+ public:
+ Config( const string& _dbname , const BSONObj& cmdObj );
+
+ string dbname;
+ string ns;
+
+ // options
+ bool verbose;
+ bool jsMode;
+ int splitInfo;
+
+ // query options
+
+ BSONObj filter;
+ BSONObj sort;
+ long long limit;
+
+ // functions
+
+ scoped_ptr<Mapper> mapper;
+ scoped_ptr<Reducer> reducer;
+ scoped_ptr<Finalizer> finalizer;
+
+ BSONObj mapParams;
+ BSONObj scopeSetup;
+
+ // output tables
+ string incLong;
+ string tempLong;
+
+ string finalShort;
+ string finalLong;
+
+ string outDB;
+
+ // max number of keys allowed in JS map before switching mode
+ long jsMaxKeys;
+ // ratio of duplicates vs unique keys before reduce is triggered in js mode
+ float reduceTriggerRatio;
+ // maximum size of map before it gets dumped to disk
+ long maxInMemSize;
+
+ enum { REPLACE , // atomically replace the collection
+ MERGE , // merge keys, override dups
+ REDUCE , // merge keys, reduce dups
+ INMEMORY // only store in memory, limited in size
+ } outType;
+
+ // if true, no lock during output operation
+ bool outNonAtomic;
+
+ static AtomicUInt JOB_NUMBER;
+        }; // end Config
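+
+        /* sketch of how the documented "out" forms presumably map onto outType
+           (an assumption based on the enum comments above, not verified here):
+             out: "coll"             -> REPLACE
+             out: { merge:  "coll" } -> MERGE
+             out: { reduce: "coll" } -> REDUCE
+             out: { inline: 1 }      -> INMEMORY
+           out: { ..., nonAtomic: true } would correspond to outNonAtomic. */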
+
+ /**
+ * stores information about intermediate map reduce state
+ * controls flow of data from map->reduce->finalize->output
+ */
+ class State {
+ public:
+ State( const Config& c );
+ ~State();
+
+ void init();
+
+ // ---- prep -----
+ bool sourceExists();
+
+ long long incomingDocuments();
+
+ // ---- map stage ----
+
+ /**
+             * stages an emitted tuple in in-memory storage
+ */
+ void emit( const BSONObj& a );
+
+ /**
+ * if size is big, run a reduce
+             * if it's still big, dump to temp collection
+ */
+ void checkSize();
+
+ /**
+ * run reduce on _temp
+ */
+ void reduceInMemory();
+
+ /**
+             * transfers in-memory storage to the temp collection
+ */
+ void dumpToInc();
+ void insertToInc( BSONObj& o );
+ void _insertToInc( BSONObj& o );
+
+ // ------ reduce stage -----------
+
+ void prepTempCollection();
+
+ void finalReduce( BSONList& values );
+
+ void finalReduce( CurOp * op , ProgressMeterHolder& pm );
+
+ // ------- cleanup/data positioning ----------
+
+ /**
+             @return number of objects in collection
+ */
+ long long postProcessCollection( CurOp* op , ProgressMeterHolder& pm );
+ long long postProcessCollectionNonAtomic( CurOp* op , ProgressMeterHolder& pm );
+
+ /**
+             * if INMEMORY, appends the results
+             * may also append stats or anything else it likes
+ */
+ void appendResults( BSONObjBuilder& b );
+
+ // -------- util ------------
+
+ /**
+ * inserts with correct replication semantics
+ */
+ void insert( const string& ns , const BSONObj& o );
+
+ // ------ simple accessors -----
+
+            /** State maintains ownership; do not use past State lifetime */
+ Scope* scope() { return _scope.get(); }
+
+ const Config& config() { return _config; }
+
+            bool isOnDisk() const { return _onDisk; }
+
+ long long numEmits() const { if (_jsMode) return _scope->getNumberLongLong("_emitCt"); return _numEmits; }
+ long long numReduces() const { if (_jsMode) return _scope->getNumberLongLong("_redCt"); return _config.reducer->numReduces; }
+
+ bool jsMode() {return _jsMode;}
+ void switchMode(bool jsMode);
+ void bailFromJS();
+
+ const Config& _config;
+ DBDirectClient _db;
+
+ protected:
+
+ void _add( InMemory* im , const BSONObj& a , long& size );
+
+ scoped_ptr<Scope> _scope;
+            bool _onDisk; // whether the end result of this map/reduce is written to disk
+
+ scoped_ptr<InMemory> _temp;
+ long _size; // bytes in _temp
+ long _dupCount; // number of duplicate key entries
+
+ long long _numEmits;
+
+ bool _jsMode;
+ ScriptingFunction _reduceAll;
+ ScriptingFunction _reduceAndEmit;
+ ScriptingFunction _reduceAndFinalize;
+ ScriptingFunction _reduceAndFinalizeAndInsert;
+ };
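+
+        /* rough data flow through State, summarizing the stages above:
+             emit() -> _temp (InMemory) -> checkSize() -> reduceInMemory()
+             -> dumpToInc() (when on disk) -> finalReduce() -> postProcessCollection() */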
+
+ BSONObj fast_emit( const BSONObj& args, void* data );
+ BSONObj _bailFromJS( const BSONObj& args, void* data );
+
+ } // end mr namespace
+}
+
+
diff --git a/src/mongo/db/commands/pipeline.cpp b/src/mongo/db/commands/pipeline.cpp
new file mode 100755
index 00000000000..4ad5e342aed
--- /dev/null
+++ b/src/mongo/db/commands/pipeline.cpp
@@ -0,0 +1,405 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/commands/pipeline.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pdfile.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+
+ const char Pipeline::commandName[] = "aggregate";
+ const char Pipeline::pipelineName[] = "pipeline";
+ const char Pipeline::fromRouterName[] = "fromRouter";
+ const char Pipeline::splitMongodPipelineName[] = "splitMongodPipeline";
+
+ Pipeline::~Pipeline() {
+ }
+
+ Pipeline::Pipeline(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ collectionName(),
+ sourceVector(),
+ splitMongodPipeline(DEBUG_BUILD == 1), /* test: always split for DEV */
+ pCtx(pTheCtx) {
+ }
+
+
+
+ /* this structure is used to make a lookup table of operators */
+ struct StageDesc {
+ const char *pName;
+ intrusive_ptr<DocumentSource> (*pFactory)(
+ BSONElement *, const intrusive_ptr<ExpressionContext> &);
+ };
+
+ /* this table must be in alphabetical order by name for bsearch() */
+ static const StageDesc stageDesc[] = {
+#ifdef NEVER /* disabled for now in favor of $match */
+ {DocumentSourceFilter::filterName,
+ DocumentSourceFilter::createFromBson},
+#endif
+ {DocumentSourceGroup::groupName,
+ DocumentSourceGroup::createFromBson},
+ {DocumentSourceLimit::limitName,
+ DocumentSourceLimit::createFromBson},
+ {DocumentSourceMatch::matchName,
+ DocumentSourceMatch::createFromBson},
+#ifdef LATER /* https://jira.mongodb.org/browse/SERVER-3253 */
+ {DocumentSourceOut::outName,
+ DocumentSourceOut::createFromBson},
+#endif
+ {DocumentSourceProject::projectName,
+ DocumentSourceProject::createFromBson},
+ {DocumentSourceSkip::skipName,
+ DocumentSourceSkip::createFromBson},
+ {DocumentSourceSort::sortName,
+ DocumentSourceSort::createFromBson},
+ {DocumentSourceUnwind::unwindName,
+ DocumentSourceUnwind::createFromBson},
+ };
+ static const size_t nStageDesc = sizeof(stageDesc) / sizeof(StageDesc);
+
+ static int stageDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const StageDesc *)pL)->pName,
+ ((const StageDesc *)pR)->pName);
+ }
+
+ boost::shared_ptr<Pipeline> Pipeline::parseCommand(
+ string &errmsg, BSONObj &cmdObj,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ boost::shared_ptr<Pipeline> pPipeline(new Pipeline(pCtx));
+ vector<BSONElement> pipeline;
+
+ /* gather the specification for the aggregation */
+ for(BSONObj::iterator cmdIterator = cmdObj.begin();
+ cmdIterator.more(); ) {
+ BSONElement cmdElement(cmdIterator.next());
+ const char *pFieldName = cmdElement.fieldName();
+
+ /* look for the aggregation command */
+ if (!strcmp(pFieldName, commandName)) {
+ pPipeline->collectionName = cmdElement.String();
+ continue;
+ }
+
+            /* check for the pipeline specification */
+ if (!strcmp(pFieldName, pipelineName)) {
+ pipeline = cmdElement.Array();
+ continue;
+ }
+
+ /* if the request came from the router, we're in a shard */
+ if (!strcmp(pFieldName, fromRouterName)) {
+ pCtx->setInShard(cmdElement.Bool());
+ continue;
+ }
+
+ /* check for debug options */
+ if (!strcmp(pFieldName, splitMongodPipelineName)) {
+ pPipeline->splitMongodPipeline = true;
+ continue;
+ }
+
+ /* we didn't recognize a field in the command */
+            ostringstream sb;
+            sb <<
+                "Pipeline::parseCommand(): unrecognized field \"" <<
+                cmdElement.fieldName() << "\"";
+            errmsg = sb.str();
+ return boost::shared_ptr<Pipeline>();
+ }
+
+ /*
+ If we get here, we've harvested the fields we expect for a pipeline.
+
+ Set up the specified document source pipeline.
+ */
+ SourceVector *pSourceVector = &pPipeline->sourceVector; // shorthand
+
+ /* iterate over the steps in the pipeline */
+ const size_t nSteps = pipeline.size();
+ for(size_t iStep = 0; iStep < nSteps; ++iStep) {
+ /* pull out the pipeline element as an object */
+ BSONElement pipeElement(pipeline[iStep]);
+ uassert(15942, str::stream() << "pipeline element " <<
+ iStep << " is not an object",
+ pipeElement.type() == Object);
+ BSONObj bsonObj(pipeElement.Obj());
+
+ intrusive_ptr<DocumentSource> pSource;
+
+ /* use the object to add a DocumentSource to the processing chain */
+ BSONObjIterator bsonIterator(bsonObj);
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ const char *pFieldName = bsonElement.fieldName();
+
+ /* select the appropriate operation and instantiate */
+ StageDesc key;
+ key.pName = pFieldName;
+ const StageDesc *pDesc = (const StageDesc *)
+ bsearch(&key, stageDesc, nStageDesc, sizeof(StageDesc),
+ stageDescCmp);
+ if (pDesc)
+ pSource = (*pDesc->pFactory)(&bsonElement, pCtx);
+ else {
+ ostringstream sb;
+ sb <<
+                        "Pipeline::parseCommand(): unrecognized pipeline op \"" <<
+                        pFieldName << "\"";
+ errmsg = sb.str();
+ return shared_ptr<Pipeline>();
+ }
+ }
+
+ pSourceVector->push_back(pSource);
+ }
+
+ /* if there aren't any pipeline stages, there's nothing more to do */
+ if (!pSourceVector->size())
+ return pPipeline;
+
+ /*
+ Move filters up where possible.
+
+          CW TODO -- move filters past projections where possible, noting
+          corresponding field renamings.
+ */
+
+ /*
+ Wherever there is a match immediately following a sort, swap them.
+ This means we sort fewer items. Neither changes the documents in
+ the stream, so this transformation shouldn't affect the result.
+
+ We do this first, because then when we coalesce operators below,
+ any adjacent matches will be combined.
+ */
+ for(size_t srcn = pSourceVector->size(), srci = 1;
+ srci < srcn; ++srci) {
+ intrusive_ptr<DocumentSource> &pSource = pSourceVector->at(srci);
+ if (dynamic_cast<DocumentSourceMatch *>(pSource.get())) {
+ intrusive_ptr<DocumentSource> &pPrevious =
+ pSourceVector->at(srci - 1);
+ if (dynamic_cast<DocumentSourceSort *>(pPrevious.get())) {
+ /* swap this item with the previous */
+ intrusive_ptr<DocumentSource> pTemp(pPrevious);
+ pPrevious = pSource;
+ pSource = pTemp;
+ }
+ }
+ }
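+
+        /* illustrative effect of the swap above (hypothetical stages):
+             [ { $sort: { a: 1 } }, { $match: { a: { $gt: 5 } } } ]
+           becomes
+             [ { $match: { a: { $gt: 5 } } }, { $sort: { a: 1 } } ]
+           so fewer documents reach the sort, with identical output. */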
+
+ /*
+ Coalesce adjacent filters where possible. Two adjacent filters
+ are equivalent to one filter whose predicate is the conjunction of
+ the two original filters' predicates. For now, capture this by
+          giving any DocumentSource the option to absorb its successor; this
+ will also allow adjacent projections to coalesce when possible.
+
+ Run through the DocumentSources, and give each one the opportunity
+ to coalesce with its successor. If successful, remove the
+ successor.
+
+ Move all document sources to a temporary list.
+ */
+ SourceVector tempVector(*pSourceVector);
+ pSourceVector->clear();
+
+ /* move the first one to the final list */
+ pSourceVector->push_back(tempVector[0]);
+
+ /* run through the sources, coalescing them or keeping them */
+ for(size_t tempn = tempVector.size(), tempi = 1;
+ tempi < tempn; ++tempi) {
+ /*
+ If we can't coalesce the source with the last, then move it
+ to the final list, and make it the new last. (If we succeeded,
+ then we're still on the same last, and there's no need to move
+ or do anything with the source -- the destruction of tempVector
+ will take care of the rest.)
+ */
+ intrusive_ptr<DocumentSource> &pLastSource = pSourceVector->back();
+ intrusive_ptr<DocumentSource> &pTemp = tempVector.at(tempi);
+ if (!pLastSource->coalesce(pTemp))
+ pSourceVector->push_back(pTemp);
+ }
+
+ /* optimize the elements in the pipeline */
+ for(SourceVector::iterator iter(pSourceVector->begin()),
+ listEnd(pSourceVector->end()); iter != listEnd; ++iter)
+ (*iter)->optimize();
+
+ return pPipeline;
+ }
+
+ shared_ptr<Pipeline> Pipeline::splitForSharded() {
+        /* create and initialize the shard spec we'll return */
+ shared_ptr<Pipeline> pShardPipeline(new Pipeline(pCtx));
+ pShardPipeline->collectionName = collectionName;
+
+ /* put the source list aside */
+ SourceVector tempVector(sourceVector);
+ sourceVector.clear();
+
+ /*
+ Run through the pipeline, looking for points to split it into
+ shard pipelines, and the rest.
+ */
+ while(!tempVector.empty()) {
+ intrusive_ptr<DocumentSource> &pSource = tempVector.front();
+
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSource.get());
+ if (pSort) {
+ /*
+ There's no point in sorting until the result is combined.
+ Therefore, sorts should be done in mongos, and not in
+ the shard at all. Add all the remaining operators to
+ the mongos list and quit.
+
+ TODO: unless the sort key is the shard key.
+ TODO: we could also do a merge sort in mongos in the
+ future, and split here.
+ */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+#endif
+
+ /* hang on to this in advance, in case it is a group */
+ DocumentSourceGroup *pGroup =
+ dynamic_cast<DocumentSourceGroup *>(pSource.get());
+
+ /* move the source from the tempVector to the shard sourceVector */
+ pShardPipeline->sourceVector.push_back(pSource);
+ tempVector.erase(tempVector.begin());
+
+ /*
+ If we found a group, that's a split point.
+ */
+ if (pGroup) {
+ /* start this pipeline with the group merger */
+ sourceVector.push_back(pGroup->createMerger());
+
+ /* and then add everything that remains and quit */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+ }
+
+ return pShardPipeline;
+ }
+
+ void Pipeline::getCursorMods(BSONObjBuilder *pQueryBuilder,
+ BSONObjBuilder *pSortBuilder) {
+ /* look for an initial $match */
+ if (!sourceVector.size())
+ return;
+ const intrusive_ptr<DocumentSource> &pMC = sourceVector.front();
+ const DocumentSourceMatch *pMatch =
+ dynamic_cast<DocumentSourceMatch *>(pMC.get());
+
+ if (pMatch) {
+ /* build the query */
+ pMatch->toMatcherBson(pQueryBuilder);
+
+ /* remove the match from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+
+ /* look for an initial $sort */
+ if (!sourceVector.size())
+ return;
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ const intrusive_ptr<DocumentSource> &pSC = sourceVector.front();
+ const DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSC.get());
+
+ if (pSort) {
+ /* build the sort key */
+ pSort->sortKeyToBson(pSortBuilder, false);
+
+ /* remove the sort from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+#endif
+ }
+
+ void Pipeline::toBson(BSONObjBuilder *pBuilder) const {
+ /* create an array out of the pipeline operations */
+ BSONArrayBuilder arrayBuilder;
+ for(SourceVector::const_iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pSource(*iter);
+ pSource->addToBsonArray(&arrayBuilder);
+ }
+
+ /* add the top-level items to the command */
+ pBuilder->append(commandName, getCollectionName());
+ pBuilder->append(pipelineName, arrayBuilder.arr());
+
+ bool btemp;
+ if ((btemp = getSplitMongodPipeline())) {
+ pBuilder->append(splitMongodPipelineName, btemp);
+ }
+ if ((btemp = pCtx->getInRouter())) {
+ pBuilder->append(fromRouterName, btemp);
+ }
+ }
+
+ bool Pipeline::run(BSONObjBuilder &result, string &errmsg,
+ intrusive_ptr<DocumentSource> pSource) {
+ /* chain together the sources we found */
+ for(SourceVector::iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pTemp(*iter);
+ pTemp->setSource(pSource);
+ pSource = pTemp;
+ }
+ /* pSource is left pointing at the last source in the chain */
+
+ /*
+ Iterate through the resulting documents, and add them to the result.
+ */
+ BSONArrayBuilder resultArray; // where we'll stash the results
+ for(bool hasDocument = !pSource->eof(); hasDocument;
+ hasDocument = pSource->advance()) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* add the document to the result set */
+ BSONObjBuilder documentBuilder;
+ pDocument->toBson(&documentBuilder);
+ resultArray.append(documentBuilder.done());
+ }
+
+ result.appendArray("result", resultArray.arr());
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/pipeline.h b/src/mongo/db/commands/pipeline.h
new file mode 100755
index 00000000000..ef9cc6afe51
--- /dev/null
+++ b/src/mongo/db/commands/pipeline.h
@@ -0,0 +1,183 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "db/jsobj.h"
+#include "util/timer.h"
+#include "db/commands.h"
+
+namespace mongo {
+ class BSONObj;
+ class BSONObjBuilder;
+ class DocumentSource;
+ class DocumentSourceProject;
+ class Expression;
+ class ExpressionContext;
+ class ExpressionNary;
+ struct OpDesc; // local private struct
+
+    /** an aggregation pipeline: an ordered chain of DocumentSources that
+        documents flow through; built from the "aggregate" command.
+    */
+ class Pipeline :
+ boost::noncopyable {
+ public:
+ virtual ~Pipeline();
+
+ /*
+ Create a pipeline from the command.
+
+ @param errmsg where to write errors, if there are any
+ @param cmdObj the command object sent from the client
+ @returns the pipeline, if created, otherwise a NULL reference
+ */
+ static boost::shared_ptr<Pipeline> parseCommand(
+ string &errmsg, BSONObj &cmdObj,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Get the collection name from the command.
+
+ @returns the collection name
+ */
+ string getCollectionName() const;
+
+ /*
+ Split the current Pipeline into a Pipeline for each shard, and
+ a Pipeline that combines the results within mongos.
+
+ This permanently alters this pipeline for the merging operation.
+
+ @returns the Spec for the pipeline command that should be sent
+ to the shards
+ */
+ boost::shared_ptr<Pipeline> splitForSharded();
+
+ /*
+ Get Cursor creation modifiers.
+
+ If we have a $match or a $sort at the beginning of the pipeline,
+ these can be extracted and used to modify the cursor we'll use for
+ the initial collection scan.
+
+ If there is a Matcher query at the beginning of the pipeline,
+ get it, by adding its terms to the object under construction. If
+ not, this adds nothing to the object under construction.
+
+ If there is a sort at the beginning of the pipeline, get it, by
+ adding its terms to the object under construction. If not, this adds
+ nothing.
+
+ Optimization steps in parseCommand make sure that for any pairs
+ of adjacent matches and sorts, the match comes first. This ensures
+ that we sort a minimum of items, and doesn't change the result.
+ When getCursorMods() examines the pipeline, it looks for an initial
+ $match. If present, that is put into pQueryBuilder. If there is
+ a query, then the next stage is checked for a $sort, which will go
+ into pSortBuilder. If there is no initial $match, then a check is
+ made for an initial $sort, which will then still be put into
+ pSortBuilder.
+
+ As a side-effect, retrieving the Cursor modifications removes them
+ from the pipeline.
+
+ @param pQueryBuilder an initialized object builder
+ @param pSortBuilder an initialized object builder
+ */
+ void getCursorMods(BSONObjBuilder *pQueryBuilder,
+ BSONObjBuilder *pSortBuilder);
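+
+        /* illustrative example (hypothetical pipeline): given
+             [ { $match: { a: 1 } }, { $sort: { b: 1 } }, { $group: ... } ]
+           pQueryBuilder receives { a: 1 } and pSortBuilder receives { b: 1 }
+           (the latter once sort extraction is enabled; see SERVER-3832),
+           leaving the pipeline holding only [ { $group: ... } ]. */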
+
+ /*
+ Write the Pipeline as a BSONObj command. This should be the
+ inverse of parseCommand().
+
+ This is only intended to be used by the shard command obtained
+ from splitForSharded(). Some pipeline operations in the merge
+ process do not have equivalent command forms, and using this on
+ the mongos Pipeline will cause assertions.
+
+          @param pBuilder the builder to write the command to
+ */
+ void toBson(BSONObjBuilder *pBuilder) const;
+
+ /*
+ Run the Pipeline on the given source.
+
+ @param result builder to write the result to
+ @param errmsg place to put error messages, if any
+ @param pSource the document source to use at the head of the chain
+ @returns true on success, false if an error occurs
+ */
+ bool run(BSONObjBuilder &result, string &errmsg,
+ intrusive_ptr<DocumentSource> pSource);
+
+ /*
+ Debugging: should the processing pipeline be split within
+ mongod, simulating the real mongos/mongod split? This is determined
+ by setting the splitMongodPipeline field in an "aggregate"
+ command.
+
+ The split itself is handled by the caller, which is currently
+ pipeline_command.cpp.
+
+ @returns true if the pipeline is to be split
+ */
+ bool getSplitMongodPipeline() const;
+
+ /*
+ The aggregation command name.
+ */
+ static const char commandName[];
+
+ private:
+ static const char pipelineName[];
+ static const char fromRouterName[];
+ static const char splitMongodPipelineName[];
+
+ Pipeline(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ string collectionName;
+ typedef vector<intrusive_ptr<DocumentSource> > SourceVector;
+ SourceVector sourceVector;
+
+ bool splitMongodPipeline;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+} // namespace mongo
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline string Pipeline::getCollectionName() const {
+ return collectionName;
+ }
+
+ inline bool Pipeline::getSplitMongodPipeline() const {
+ if (!DEBUG_BUILD)
+ return false;
+
+ return splitMongodPipeline;
+ }
+
+} // namespace mongo
+
+
diff --git a/src/mongo/db/commands/pipeline_command.cpp b/src/mongo/db/commands/pipeline_command.cpp
new file mode 100755
index 00000000000..9863e14556c
--- /dev/null
+++ b/src/mongo/db/commands/pipeline_command.cpp
@@ -0,0 +1,187 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/commands/pipeline.h"
+#include "db/cursor.h"
+#include "db/pdfile.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/queryoptimizer.h"
+
+namespace mongo {
+
+ /** mongodb "commands" (sent via db.$cmd.findOne(...))
+ subclass to make a command. define a singleton object for it.
+ */
+ class PipelineCommand :
+ public Command {
+ public:
+ // virtuals from Command
+ virtual ~PipelineCommand();
+ virtual bool run(const string &db, BSONObj &cmdObj, int options,
+ string &errmsg, BSONObjBuilder &result, bool fromRepl);
+ virtual LockType locktype() const;
+ virtual bool slaveOk() const;
+ virtual void help(stringstream &help) const;
+
+ PipelineCommand();
+ };
+
+ // self-registering singleton static instance
+ static PipelineCommand pipelineCommand;
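+
+    /* illustrative client invocation (hypothetical collection and stages):
+         db.runCommand({ aggregate: "events",
+                         pipeline: [ { $match: { a: 1 } },
+                                     { $group: { _id: "$b" } } ] })
+       adding splitMongodPipeline: true exercises the debug split path in
+       run() below. */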
+
+ PipelineCommand::PipelineCommand():
+ Command(Pipeline::commandName) {
+ }
+
+ Command::LockType PipelineCommand::locktype() const {
+ return READ;
+ }
+
+ bool PipelineCommand::slaveOk() const {
+ return true;
+ }
+
+ void PipelineCommand::help(stringstream &help) const {
+ help << "{ pipeline : [ { <data-pipe-op>: {...}}, ... ] }";
+ }
+
+ PipelineCommand::~PipelineCommand() {
+ }
+
+ bool PipelineCommand::run(const string &db, BSONObj &cmdObj,
+ int options, string &errmsg,
+ BSONObjBuilder &result, bool fromRepl) {
+
+ intrusive_ptr<ExpressionContext> pCtx(ExpressionContext::create());
+
+ /* try to parse the command; if this fails, then we didn't run */
+ boost::shared_ptr<Pipeline> pPipeline(
+ Pipeline::parseCommand(errmsg, cmdObj, pCtx));
+ if (!pPipeline.get())
+ return false;
+
+ /* get a query to use, if any */
+ BSONObjBuilder queryBuilder;
+ BSONObjBuilder sortBuilder;
+ pPipeline->getCursorMods(&queryBuilder, &sortBuilder);
+ BSONObj query(queryBuilder.done());
+ BSONObj sort(sortBuilder.done());
+
+ /* for debugging purposes, show what the query and sort are */
+ DEV {
+ (log() << "\n---- query BSON\n" <<
+ query.jsonString(Strict, 1) << "\n----\n").flush();
+ (log() << "\n---- sort BSON\n" <<
+ sort.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* create a cursor for that query */
+ string fullName(db + "." + pPipeline->getCollectionName());
+ shared_ptr<Cursor> pCursor(
+ NamespaceDetailsTransient::getCursor(
+ fullName.c_str(), query
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ , sort
+#endif
+ ));
+
+ /* wrap the cursor with a DocumentSource */
+ intrusive_ptr<DocumentSource> pSource(
+ DocumentSourceCursor::create(pCursor));
+
+ /* this is the normal non-debug path */
+ if (!pPipeline->getSplitMongodPipeline())
+ return pPipeline->run(result, errmsg, pSource);
+
+        /* set up as if we're in the router */
+ pCtx->setInRouter(true);
+
+ /*
+ Here, we'll split the pipeline in the same way we would for sharding,
+ for testing purposes.
+
+ Run the shard pipeline first, then feed the results into the remains
+ of the existing pipeline.
+
+ Start by splitting the pipeline.
+ */
+ shared_ptr<Pipeline> pShardSplit(
+ pPipeline->splitForSharded());
+
+ /*
+ Write the split pipeline as we would in order to transmit it to
+ the shard servers.
+ */
+ BSONObjBuilder shardBuilder;
+ pShardSplit->toBson(&shardBuilder);
+ BSONObj shardBson(shardBuilder.done());
+
+ DEV (log() << "\n---- shardBson\n" <<
+ shardBson.jsonString(Strict, 1) << "\n----\n").flush();
+
+ /* for debugging purposes, show what the pipeline now looks like */
+ DEV {
+ BSONObjBuilder pipelineBuilder;
+ pPipeline->toBson(&pipelineBuilder);
+ BSONObj pipelineBson(pipelineBuilder.done());
+ (log() << "\n---- pipelineBson\n" <<
+ pipelineBson.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* on the shard servers, create the local pipeline */
+ intrusive_ptr<ExpressionContext> pShardCtx(ExpressionContext::create());
+ shared_ptr<Pipeline> pShardPipeline(
+ Pipeline::parseCommand(errmsg, shardBson, pShardCtx));
+ if (!pShardPipeline.get()) {
+ return false;
+ }
+
+ /* run the shard pipeline */
+ BSONObjBuilder shardResultBuilder;
+ string shardErrmsg;
+ pShardPipeline->run(shardResultBuilder, shardErrmsg, pSource);
+ BSONObj shardResult(shardResultBuilder.done());
+
+ /* pick out the shard result, and prepare to read it */
+ intrusive_ptr<DocumentSourceBsonArray> pShardSource;
+ BSONObjIterator shardIter(shardResult);
+ while(shardIter.more()) {
+ BSONElement shardElement(shardIter.next());
+ const char *pFieldName = shardElement.fieldName();
+
+ if (strcmp(pFieldName, "result") == 0) {
+ pShardSource = DocumentSourceBsonArray::create(&shardElement);
+
+ /*
+ Connect the output of the shard pipeline with the mongos
+ pipeline that will merge the results.
+ */
+ return pPipeline->run(result, errmsg, pShardSource);
+ }
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/common.cpp b/src/mongo/db/common.cpp
new file mode 100644
index 00000000000..cd073f8b059
--- /dev/null
+++ b/src/mongo/db/common.cpp
@@ -0,0 +1,73 @@
+/** @file common.cpp
+ Common code for server binaries (mongos, mongod, test).
+   Nothing used by the driver should be here.
+ */
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+//#include "pch.h"
+//#include "concurrency.h"
+#include "jsobjmanipulator.h"
+
+/**
+ * this just has globals
+ */
+namespace mongo {
+
+ /** called by mongos, mongod, test. do not call from clients and such.
+        invoked before just about everything except global var construction.
+ */
+ void doPreServerStartupInits() {
+#if defined(RLIMIT_NPROC) && defined(RLIMIT_NOFILE)
+        // Check that the open-files rlimit is > 1000, and the processes rlimit is > # of files / 2
+ const unsigned int minNumFiles = 1000;
+ const double filesToProcsRatio = 2.0;
+ struct rlimit rlnproc;
+ struct rlimit rlnofile;
+
+ if(!getrlimit(RLIMIT_NPROC,&rlnproc) && !getrlimit(RLIMIT_NOFILE,&rlnofile)){
+ if(rlnofile.rlim_cur < minNumFiles){
+ log() << "Warning: soft rlimits too low. Number of files is " << rlnofile.rlim_cur << ", should be at least " << minNumFiles << endl;
+ }
+ if(rlnproc.rlim_cur < rlnofile.rlim_cur/filesToProcsRatio){
+ log() << "Warning: soft rlimits too low. " << rlnproc.rlim_cur << " processes, " << rlnofile.rlim_cur << " files. Number of processes should be at least "<< 1/filesToProcsRatio << " times number of files." << endl;
+ }
+ }
+ else{
+ log() << "Warning: getrlimit failed" << endl;
+ }
+#endif
+ }
+
+ NOINLINE_DECL OpTime OpTime::skewed() {
+ bool toLog = false;
+ ONCE toLog = true;
+ RARELY toLog = true;
+ last.i++;
+ if ( last.i & 0x80000000 )
+ toLog = true;
+ if ( toLog ) {
+ log() << "clock skew detected prev: " << last.secs << " now: " << (unsigned) time(0) << endl;
+ }
+ if ( last.i & 0x80000000 ) {
+ log() << "error large clock skew detected, shutting down" << endl;
+ throw ClockSkewException();
+ }
+ return last;
+ }
+
+}
diff --git a/src/mongo/db/compact.cpp b/src/mongo/db/compact.cpp
new file mode 100644
index 00000000000..32931b6c5fd
--- /dev/null
+++ b/src/mongo/db/compact.cpp
@@ -0,0 +1,376 @@
+/** @file compact.cpp
+ compaction of deleted space in pdfiles (datafiles)
+*/
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "concurrency.h"
+#include "commands.h"
+#include "curop-inl.h"
+#include "background.h"
+#include "extsort.h"
+#include "compact.h"
+#include "../util/concurrency/task.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ char faux;
+
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc);
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god);
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt);
+
+    /* this should be done in alloc record, not here; doing it here for now.
+       really dumb; it's a start.
+ */
+ unsigned quantizeMask(unsigned x) {
+ if( x > 4096 * 20 )
+ return ~4095;
+ if( x >= 512 )
+ return ~63;
+ return ~0;
+ }
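+
+    /* worked examples of the quantization above (illustrative values):
+         x = 300                 -> mask ~0    -> 300    (unchanged)
+         x = 1100   (>= 512)     -> mask ~63   -> 1088   (64-byte granule)
+         x = 100000 (> 4096*20)  -> mask ~4095 -> 98304  (4KB granule)
+       compactExtent() below applies this as
+         lenWPadding = (unsigned)(pf * lenWHdr) + pb, then & quantizeMask(lenWPadding). */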
+
+ /** @return number of skipped (invalid) documents */
+ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
+ const scoped_array<IndexSpec> &indexSpecs,
+ scoped_array<SortPhaseOne>& phase1, int nidx, bool validate,
+ double pf, int pb)
+ {
+ log() << "compact extent #" << n << endl;
+ unsigned oldObjSize = 0; // we'll report what the old padding was
+ unsigned oldObjSizeWithPadding = 0;
+
+ Extent *e = ext.ext();
+ e->assertOk();
+ assert( e->validates() );
+ unsigned skipped = 0;
+
+ {
+ // the next/prev pointers within the extent might not be in order so we first page the whole thing in
+ // sequentially
+ log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
+ Timer t;
+ MAdvise adv(e, e->length, MAdvise::Sequential);
+ const char *p = (const char *) e;
+ for( int i = 0; i < e->length; i += 4096 ) {
+ faux += p[i];
+ }
+ int ms = t.millis();
+ if( ms > 1000 )
+ log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
+ }
+
+ {
+ log() << "compact copying records" << endl;
+ unsigned totalSize = 0;
+ int nrecs = 0;
+ DiskLoc L = e->firstRecord;
+ if( !L.isNull() ) {
+ while( 1 ) {
+ Record *recOld = L.rec();
+ L = recOld->nextInExtent(L);
+ nrecs++;
+ BSONObj objOld(recOld);
+
+ if( !validate || objOld.valid() ) {
+ unsigned sz = objOld.objsize();
+
+ oldObjSize += sz;
+ oldObjSizeWithPadding += recOld->netLength();
+
+ unsigned lenWHdr = sz + Record::HeaderSize;
+ unsigned lenWPadding = lenWHdr;
+ {
+ lenWPadding = static_cast<unsigned>(pf*lenWPadding);
+ lenWPadding += pb;
+ lenWPadding = lenWPadding & quantizeMask(lenWPadding);
+ if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
+ lenWPadding = lenWHdr;
+ }
+ }
+ totalSize += lenWPadding;
+ DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
+ uassert(14024, "compact error out of space during compaction", !loc.isNull());
+ Record *recNew = loc.rec();
+ recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
+ addRecordToRecListInExtent(recNew, loc);
+ memcpy(recNew->data, objOld.objdata(), sz);
+
+ {
+ // extract keys for all indexes we will be rebuilding
+ for( int x = 0; x < nidx; x++ ) {
+ phase1[x].addKeys(indexSpecs[x], objOld, loc);
+ }
+ }
+ }
+ else {
+ if( ++skipped <= 10 )
+ log() << "compact skipping invalid object" << endl;
+ }
+
+ if( L.isNull() ) {
+ // we just did the very last record from the old extent. it's still pointed to
+ // by the old extent ext, but that will be fixed below after this loop
+ break;
+ }
+
+ // remove the old records (orphan them) periodically so our commit block doesn't get too large
+ bool stopping = false;
+ RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
+ if( stopping || getDur().aCommitIsNeeded() ) {
+ e->firstRecord.writing() = L;
+ Record *r = L.rec();
+ getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
+ getDur().commitIfNeeded();
+ killCurrentOp.checkForInterrupt(false);
+ }
+ }
+ } // if !L.isNull()
+
+ assert( d->firstExtent == ext );
+ assert( d->lastExtent != ext );
+ DiskLoc newFirst = e->xnext;
+ d->firstExtent.writing() = newFirst;
+ newFirst.ext()->xprev.writing().Null();
+ getDur().writing(e)->markEmpty();
+ freeExtents(ext,ext);
+ getDur().commitIfNeeded();
+
+ {
+ double op = 1.0;
+ if( oldObjSize )
+ op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
+ log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB"
+ << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
+ << endl;
+ }
+ }
+
+ return skipped;
+ }
+
+ extern SortPhaseOne *precalced;
+
+ bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
+ //int les = d->lastExtentSize;
+
+ // this is a big job, so might as well make things tidy before we start just to be nice.
+ getDur().commitNow();
+
+ list<DiskLoc> extents;
+ for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext )
+ extents.push_back(L);
+ log() << "compact " << extents.size() << " extents" << endl;
+
+ ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );
+
+        // same data, but might perform a little differently after compact?
+ NamespaceDetailsTransient::get(ns).clearQueryCache();
+
+ int nidx = d->nIndexes;
+ scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
+ scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
+ {
+ NamespaceDetails::IndexIterator ii = d->ii();
+ int x = 0;
+ while( ii.more() ) {
+ BSONObjBuilder b;
+ IndexDetails& idx = ii.next();
+ BSONObj::iterator i(idx.info.obj());
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) {
+ b.append(e);
+ }
+ }
+ BSONObj o = b.obj().getOwned();
+ phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(), o.getObjectField("key") ) );
+ phase1[x].sorter->hintNumObjects( d->stats.nrecords );
+ indexSpecs[x++].reset(o);
+ }
+ }
+
+ log() << "compact orphan deleted lists" << endl;
+ for( int i = 0; i < Buckets; i++ ) {
+ d->deletedList[i].writing().Null();
+ }
+
+
+
+ // Start over from scratch with our extent sizing and growth
+ d->lastExtentSize=0;
+
+ // before dropping indexes, at least make sure we can allocate one extent!
+ uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());
+
+ // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
+ log() << "compact dropping indexes" << endl;
+ BSONObjBuilder b;
+ if( !dropIndexes(d, ns, "*", errmsg, b, true) ) {
+ errmsg = "compact drop indexes failed";
+ log() << errmsg << endl;
+ return false;
+ }
+
+ getDur().commitNow();
+
+ long long skipped = 0;
+ int n = 0;
+ for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
+ skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb);
+ pm.hit();
+ }
+
+ if( skipped ) {
+ result.append("invalidObjects", skipped);
+ }
+
+ assert( d->firstExtent.ext()->xprev.isNull() );
+
+ // indexes will do their own progress meter?
+ pm.finished();
+
+ // build indexes
+ NamespaceString s(ns);
+ string si = s.db + ".system.indexes";
+ for( int i = 0; i < nidx; i++ ) {
+ killCurrentOp.checkForInterrupt(false);
+ BSONObj info = indexSpecs[i].info;
+ log() << "compact create index " << info["key"].Obj().toString() << endl;
+ try {
+ precalced = &phase1[i];
+ theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
+ }
+ catch(...) {
+ precalced = 0;
+ throw;
+ }
+ precalced = 0;
+ }
+
+ return true;
+ }
+
+ bool compact(const string& ns, string &errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
+ massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
+        massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved; there are pointers to those disklocs in NamespaceDetails
+
+ bool ok;
+ {
+ writelock lk;
+ BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
+ Client::Context ctx(ns);
+ NamespaceDetails *d = nsdetails(ns.c_str());
+ massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
+ massert( 13661, "cannot compact capped collection", !d->capped );
+ log() << "compact " << ns << " begin" << endl;
+ if( pf != 0 || pb != 0 ) {
+ log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl;
+ }
+ try {
+ ok = _compact(ns.c_str(), d, errmsg, validate, result, pf, pb);
+ }
+ catch(...) {
+ log() << "compact " << ns << " end (with error)" << endl;
+ throw;
+ }
+ log() << "compact " << ns << " end" << endl;
+ }
+ return ok;
+ }
+
+ bool isCurrentlyAReplSetPrimary();
+
+ class CompactCmd : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool maintenanceMode() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual void help( stringstream& help ) const {
+            help << "compact collection\n"
+                "warning: this operation blocks the server and is slow. you can cancel with cancelOp()\n"
+                "{ compact : <collection_name>, [force:true], [validate:true] }\n"
+                "  force - allows the command to run on a replica set primary\n"
+                "  validate - check that records are not corrupt before adding them to the newly compacted extents. slower but safer (default is true in this version)\n";
+ }
+ virtual bool requiresAuth() { return true; }
+ CompactCmd() : Command("compact") { }
+
+ virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string coll = cmdObj.firstElement().valuestr();
+ if( coll.empty() || db.empty() ) {
+ errmsg = "no collection name specified";
+ return false;
+ }
+
+ if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) {
+ errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
+ return false;
+ }
+
+ string ns = db + '.' + coll;
+ if ( ! NamespaceString::normal(ns.c_str()) ) {
+ errmsg = "bad namespace name";
+ return false;
+ }
+
+ // parameter validation to avoid triggering assertions in compact()
+ if ( str::contains(ns, ".system.") ) {
+ errmsg = "can't compact a system namespace";
+ return false;
+ }
+
+ {
+ writelock lk;
+ Client::Context ctx(ns);
+ NamespaceDetails *d = nsdetails(ns.c_str());
+ if( ! d ) {
+ errmsg = "namespace does not exist";
+ return false;
+ }
+
+ if ( d->capped ) {
+ errmsg = "cannot compact a capped collection";
+ return false;
+ }
+ }
+
+ double pf = 1.0;
+ int pb = 0;
+ if( cmdObj.hasElement("paddingFactor") ) {
+ pf = cmdObj["paddingFactor"].Number();
+ assert( pf >= 1.0 && pf <= 4.0 );
+ }
+ if( cmdObj.hasElement("paddingBytes") ) {
+ pb = (int) cmdObj["paddingBytes"].Number();
+ assert( pb >= 0 && pb <= 1024 * 1024 );
+ }
+
+ bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment
+ bool ok = compact(ns, errmsg, validate, result, pf, pb);
+ return ok;
+ }
+ };
+ static CompactCmd compactCmd;
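+
+    /* illustrative shell invocation (hypothetical collection name):
+         db.runCommand({ compact: "events", paddingFactor: 1.1, validate: true, force: true })
+       per the checks above, paddingFactor must lie in [1.0, 4.0] and
+       paddingBytes in [0, 1MB]; force is required on a replica set primary. */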
+
+}
diff --git a/src/mongo/db/compact.h b/src/mongo/db/compact.h
new file mode 100644
index 00000000000..7bf49c8e1b8
--- /dev/null
+++ b/src/mongo/db/compact.h
@@ -0,0 +1,50 @@
+// compact.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /** for bottom up fastbuildindex (where we presort keys) */
+ struct SortPhaseOne {
+ SortPhaseOne() {
+ n = 0;
+ nkeys = 0;
+ multi = false;
+ }
+ shared_ptr<BSONObjExternalSorter> sorter;
+ unsigned long long n; // # of records
+ unsigned long long nkeys;
+ bool multi; // multikey index
+
+ void addKeys(const IndexSpec& spec, const BSONObj& o, DiskLoc loc) {
+ BSONObjSet keys;
+ spec.getKeys(o, keys);
+ int k = 0;
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ if( ++k == 2 ) {
+ multi = true;
+ }
+ sorter->add(*i, loc);
+ nkeys++;
+ }
+ n++;
+ }
+ };
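+
+    // Illustrative sketch of how a presort pass might drive SortPhaseOne, assuming a
+    // sorter has been attached and (obj, loc) pairs come from a full collection scan:
+    //     SortPhaseOne phase1;
+    //     phase1.sorter.reset( ... );   // a BSONObjExternalSorter ordered on the index key
+    //     for each record: phase1.addKeys( idxSpec, obj, diskLoc );
+    // afterwards phase1.sorter holds every key in sorted order for a bottom-up btree build.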
+
+}
diff --git a/src/mongo/db/concurrency.h b/src/mongo/db/concurrency.h
new file mode 100644
index 00000000000..33bc0caac77
--- /dev/null
+++ b/src/mongo/db/concurrency.h
@@ -0,0 +1,21 @@
+// @file concurrency.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
diff --git a/src/mongo/db/curop-inl.h b/src/mongo/db/curop-inl.h
new file mode 100644
index 00000000000..7dd678b185d
--- /dev/null
+++ b/src/mongo/db/curop-inl.h
@@ -0,0 +1 @@
+#include "curop.h"
diff --git a/src/mongo/db/curop.cpp b/src/mongo/db/curop.cpp
new file mode 100644
index 00000000000..3cc452b46cc
--- /dev/null
+++ b/src/mongo/db/curop.cpp
@@ -0,0 +1,173 @@
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "curop.h"
+#include "database.h"
+
+namespace mongo {
+
+ // todo : move more here
+
+ CurOp::CurOp( Client * client , CurOp * wrapped ) :
+ _client(client),
+ _wrapped(wrapped)
+ {
+ if ( _wrapped )
+ _client->_curOp = this;
+ _start = _checkpoint = 0;
+ _active = false;
+ _reset();
+ _op = 0;
+ // These addresses should never be written to again. The zeroes are
+ // placed here as a precaution because currentOp may be accessed
+ // without the db mutex.
+ memset(_ns, 0, sizeof(_ns));
+ }
+
+ void CurOp::_reset() {
+ _command = false;
+ _lockType = 0;
+ _dbprofile = 0;
+ _end = 0;
+ _waitingForLock = false;
+ _message = "";
+ _progressMeter.finished();
+ _killed = false;
+ _numYields = 0;
+ }
+
+ void CurOp::reset() {
+ _reset();
+ _start = _checkpoint = 0;
+ _opNum = _nextOpNum++;
+ _ns[0] = 0;
+ _debug.reset();
+ _query.reset();
+ _active = true; // this should be last for ui clarity
+ }
+
+ void CurOp::reset( const HostAndPort& remote, int op ) {
+ reset();
+ if( _remote != remote ) {
+ // todo : _remote is not thread safe yet is used as such!
+ _remote = remote;
+ }
+ _op = op;
+ }
+
+ ProgressMeter& CurOp::setMessage( const char * msg , unsigned long long progressMeterTotal , int secondsBetween ) {
+ if ( progressMeterTotal ) {
+ if ( _progressMeter.isActive() ) {
+ cout << "about to assert, old _message: " << _message << " new message:" << msg << endl;
+ assert( ! _progressMeter.isActive() );
+ }
+ _progressMeter.reset( progressMeterTotal , secondsBetween );
+ }
+ else {
+ _progressMeter.finished();
+ }
+ _message = msg;
+ return _progressMeter;
+ }
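+
+    // Illustrative usage (sketch): a long-running operation reports progress through its
+    // CurOp roughly like this, and the message/meter then show up in currentOp output:
+    //     ProgressMeter& pm = cc().curop()->setMessage( "some long operation" , totalUnits );
+    //     // ... call pm.hit() once per unit of work ...
+    //     cc().curop()->setMessage( "" );   // a zero total finishes the meter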
+
+
+ BSONObj CurOp::info() {
+ if( ! cc().getAuthenticationInfo()->isAuthorized("admin") ) {
+ BSONObjBuilder b;
+ b.append("err", "unauthorized");
+ return b.obj();
+ }
+ return infoNoauth();
+ }
+
+ CurOp::~CurOp() {
+ if ( _wrapped ) {
+ scoped_lock bl(Client::clientsMutex);
+ _client->_curOp = _wrapped;
+ }
+ _client = 0;
+ }
+
+ void CurOp::enter( Client::Context * context ) {
+ ensureStarted();
+ setNS( context->ns() );
+ _dbprofile = context->_db ? context->_db->profile : 0;
+ }
+
+ void CurOp::leave( Client::Context * context ) {
+ unsigned long long now = curTimeMicros64();
+ Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command );
+ _checkpoint = now;
+ }
+
+ BSONObj CurOp::infoNoauth() {
+ BSONObjBuilder b;
+ b.append("opid", _opNum);
+ bool a = _active && _start;
+ b.append("active", a);
+ if ( _lockType )
+ b.append("lockType" , _lockType > 0 ? "write" : "read" );
+ b.append("waitingForLock" , _waitingForLock );
+
+ if( a ) {
+ b.append("secs_running", elapsedSeconds() );
+ }
+
+ b.append( "op" , opToString( _op ) );
+
+ b.append("ns", _ns);
+
+ _query.append( b , "query" );
+
+ if( !_remote.empty() ) {
+ b.append("client", _remote.toString());
+ }
+
+ if ( _client ) {
+ b.append( "desc" , _client->desc() );
+ if ( _client->_threadId.size() )
+ b.append( "threadId" , _client->_threadId );
+ if ( _client->_connectionId )
+ b.appendNumber( "connectionId" , _client->_connectionId );
+ }
+
+ if ( ! _message.empty() ) {
+ if ( _progressMeter.isActive() ) {
+ StringBuilder buf(128);
+ buf << _message.toString() << " " << _progressMeter.toString();
+ b.append( "msg" , buf.str() );
+ BSONObjBuilder sub( b.subobjStart( "progress" ) );
+ sub.appendNumber( "done" , (long long)_progressMeter.done() );
+ sub.appendNumber( "total" , (long long)_progressMeter.total() );
+ sub.done();
+ }
+ else {
+ b.append( "msg" , _message.toString() );
+ }
+ }
+
+ if( killed() )
+ b.append("killed", true);
+
+ b.append( "numYields" , _numYields );
+
+ return b.obj();
+ }
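+
+    // For illustration, a typical infoNoauth() result (values invented) looks like:
+    //   { opid: 12345, active: true, lockType: "write", waitingForLock: false,
+    //     secs_running: 3, op: "query", ns: "test.foo", query: { a: 1 },
+    //     client: "127.0.0.1:50211", desc: "conn", numYields: 0 }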
+
+ AtomicUInt CurOp::_nextOpNum;
+
+}
diff --git a/src/mongo/db/curop.h b/src/mongo/db/curop.h
new file mode 100644
index 00000000000..192404d8796
--- /dev/null
+++ b/src/mongo/db/curop.h
@@ -0,0 +1,313 @@
+// @file curop.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "namespace-inl.h"
+#include "client.h"
+#include "../bson/util/atomic_int.h"
+#include "../util/concurrency/spin_lock.h"
+#include "../util/time_support.h"
+#include "../util/net/hostandport.h"
+
+namespace mongo {
+
+ class CurOp;
+
+    /* lifespan is different from CurOp's because of recursion with DBDirectClient */
+ class OpDebug {
+ public:
+ OpDebug() : ns(""){ reset(); }
+
+ void reset();
+
+ string toString() const;
+ void append( const CurOp& curop, BSONObjBuilder& b ) const;
+
+ // -------------------
+
+ StringBuilder extra; // weird things we need to fix later
+
+ // basic options
+ int op;
+ bool iscommand;
+ Namespace ns;
+ BSONObj query;
+ BSONObj updateobj;
+
+ // detailed options
+ long long cursorid;
+ int ntoreturn;
+ int ntoskip;
+ bool exhaust;
+
+ // debugging/profile info
+ int nscanned;
+ bool idhack; // indicates short circuited code path on an update to make the update faster
+ bool scanAndOrder; // scanandorder query plan aspect was used
+ bool moved; // update resulted in a move (moves are expensive)
+ bool fastmod;
+ bool fastmodinsert; // upsert of an $operation. builds a default object
+ bool upsert; // true if the update actually did an insert
+ int keyUpdates;
+
+ // error handling
+ ExceptionInfo exceptionInfo;
+
+ // response info
+ int executionTime;
+ int nreturned;
+ int responseLength;
+ };
+
+ /**
+ * stores a copy of a bson obj in a fixed size buffer
+     * if it's too big for the buffer, says "too big"
+ * useful for keeping a copy around indefinitely without wasting a lot of space or doing malloc
+ */
+ class CachedBSONObj {
+ public:
+        enum { TOO_BIG_SENTINEL = 1 };
+ static BSONObj _tooBig; // { $msg : "query not recording (too large)" }
+
+ CachedBSONObj() {
+ _size = (int*)_buf;
+ reset();
+ }
+
+ void reset( int sz = 0 ) {
+ _lock.lock();
+ _reset( sz );
+ _lock.unlock();
+ }
+
+ void set( const BSONObj& o ) {
+ scoped_spinlock lk(_lock);
+ int sz = o.objsize();
+ if ( sz > (int) sizeof(_buf) ) {
+ _reset(TOO_BIG_SENTINEL);
+ }
+ else {
+ memcpy(_buf, o.objdata(), sz );
+ }
+ }
+
+ int size() const { return *_size; }
+ bool have() const { return size() > 0; }
+
+ BSONObj get() const {
+ scoped_spinlock lk(_lock);
+ return _get();
+ }
+
+ void append( BSONObjBuilder& b , const StringData& name ) const {
+ scoped_spinlock lk(_lock);
+ BSONObj temp = _get();
+ b.append( name , temp );
+ }
+
+ private:
+ /** you have to be locked when you call this */
+ BSONObj _get() const {
+ int sz = size();
+ if ( sz == 0 )
+ return BSONObj();
+ if ( sz == TOO_BIG_SENTINEL )
+ return _tooBig;
+ return BSONObj( _buf ).copy();
+ }
+
+ /** you have to be locked when you call this */
+ void _reset( int sz ) { _size[0] = sz; }
+
+ mutable SpinLock _lock;
+ int * _size;
+ char _buf[512];
+ };
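+
+    // Illustrative usage (sketch):
+    //     CachedBSONObj q;
+    //     q.set( someQuery );          // copies into the fixed 512 byte buffer
+    //     if( q.have() )
+    //         BSONObj snap = q.get();  // safe snapshot, even from another thread
+    // objects larger than the buffer read back as the { $msg : ... } "too big" object.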
+
+ /* Current operation (for the current Client).
+ an embedded member of Client class, and typically used from within the mutex there.
+ */
+ class CurOp : boost::noncopyable {
+ public:
+ CurOp( Client * client , CurOp * wrapped = 0 );
+ ~CurOp();
+
+ bool haveQuery() const { return _query.have(); }
+ BSONObj query() { return _query.get(); }
+ void appendQuery( BSONObjBuilder& b , const StringData& name ) const { _query.append( b , name ); }
+
+ void ensureStarted() {
+ if ( _start == 0 )
+ _start = _checkpoint = curTimeMicros64();
+ }
+ bool isStarted() const { return _start > 0; }
+ void enter( Client::Context * context );
+ void leave( Client::Context * context );
+ void reset();
+ void reset( const HostAndPort& remote, int op );
+ void markCommand() { _command = true; }
+
+ void waitingForLock( int type ) {
+ _waitingForLock = true;
+ if ( type > 0 )
+ _lockType = 1;
+ else
+ _lockType = -1;
+ }
+ void gotLock() { _waitingForLock = false; }
+ OpDebug& debug() { return _debug; }
+ int profileLevel() const { return _dbprofile; }
+ const char * getNS() const { return _ns; }
+
+ bool shouldDBProfile( int ms ) const {
+ if ( _dbprofile <= 0 )
+ return false;
+
+ return _dbprofile >= 2 || ms >= cmdLine.slowMS;
+ }
+
+ AtomicUInt opNum() const { return _opNum; }
+
+ /** if this op is running */
+ bool active() const { return _active; }
+
+ int getLockType() const { return _lockType; }
+ bool isWaitingForLock() const { return _waitingForLock; }
+ int getOp() const { return _op; }
+ unsigned long long startTime() { // micros
+ ensureStarted();
+ return _start;
+ }
+ void done() {
+ _active = false;
+ _end = curTimeMicros64();
+ }
+ unsigned long long totalTimeMicros() {
+ massert( 12601 , "CurOp not marked done yet" , ! _active );
+ return _end - startTime();
+ }
+ int totalTimeMillis() { return (int) (totalTimeMicros() / 1000); }
+ int elapsedMillis() {
+ unsigned long long total = curTimeMicros64() - startTime();
+ return (int) (total / 1000);
+ }
+ int elapsedSeconds() { return elapsedMillis() / 1000; }
+ void setQuery(const BSONObj& query) { _query.set( query ); }
+ Client * getClient() const { return _client; }
+ BSONObj info();
+ BSONObj infoNoauth();
+ string getRemoteString( bool includePort = true ) { return _remote.toString(includePort); }
+ ProgressMeter& setMessage( const char * msg , unsigned long long progressMeterTotal = 0 , int secondsBetween = 3 );
+ string getMessage() const { return _message.toString(); }
+ ProgressMeter& getProgressMeter() { return _progressMeter; }
+ CurOp *parent() const { return _wrapped; }
+ void kill() { _killed = true; }
+ bool killed() const { return _killed; }
+ void yielded() { _numYields++; }
+ void setNS(const char *ns) {
+ strncpy(_ns, ns, Namespace::MaxNsLen);
+ _ns[Namespace::MaxNsLen] = 0;
+ }
+
+ private:
+ friend class Client;
+ void _reset();
+
+ static AtomicUInt _nextOpNum;
+ Client * _client;
+ CurOp * _wrapped;
+ unsigned long long _start;
+ unsigned long long _checkpoint;
+ unsigned long long _end;
+ bool _active;
+ int _op;
+ bool _command;
+ int _lockType; // see concurrency.h for values
+ bool _waitingForLock;
+ int _dbprofile; // 0=off, 1=slow, 2=all
+        AtomicUInt _opNum;                  // todo: simply being "unsigned" may make more sense here
+ char _ns[Namespace::MaxNsLen+2];
+ HostAndPort _remote; // CAREFUL here with thread safety
+ CachedBSONObj _query; // CachedBSONObj is thread safe
+ OpDebug _debug;
+ ThreadSafeString _message;
+ ProgressMeter _progressMeter;
+ volatile bool _killed;
+ int _numYields;
+ };
+
+ /* _globalKill: we are shutting down
+ otherwise kill attribute set on specified CurOp
+ this class does not handle races between interruptJs and the checkForInterrupt functions - those must be
+ handled by the client of this class
+ */
+ extern class KillCurrentOp {
+ public:
+ void killAll();
+ void kill(AtomicUInt i);
+
+ /** @return true if global interrupt and should terminate the operation */
+ bool globalInterruptCheck() const { return _globalKill; }
+
+ void checkForInterrupt( bool heedMutex = true ) {
+ Client& c = cc();
+ if ( heedMutex && d.dbMutex.isWriteLocked() )
+ return;
+ if( _globalKill )
+ uasserted(11600,"interrupted at shutdown");
+ if( c.curop()->killed() )
+ uasserted(11601,"interrupted");
+ if( c.sometimes(1024) ) {
+ AbstractMessagingPort *p = cc().port();
+ if( p )
+ p->assertStillConnected();
+ }
+ }
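+
+        // Illustrative call site (sketch): long-running loops call this once per
+        // iteration so that killOp and shutdown can stop them, e.g.
+        //     while( c->ok() ) {
+        //         killCurrentOp.checkForInterrupt();   // throws if killed
+        //         ...
+        //         c->advance();
+        //     }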
+
+ /** @return "" if not interrupted. otherwise, you should stop. */
+ const char *checkForInterruptNoAssert( /*bool heedMutex = true*/ ) {
+ Client& c = cc();
+            // always called with false so the check is commented out:
+ /*if ( heedMutex && d.dbMutex.isWriteLocked() )
+ return "";*/
+ if( _globalKill )
+ return "interrupted at shutdown";
+ if( c.curop()->killed() )
+ return "interrupted";
+ if( c.sometimes(1024) ) {
+ try {
+ AbstractMessagingPort *p = cc().port();
+ if( p )
+ p->assertStillConnected();
+ }
+ catch(...) {
+ log() << "no longer connected to client";
+ return "no longer connected to client";
+ }
+ }
+ return "";
+ }
+
+ private:
+ void interruptJs( AtomicUInt *op );
+ volatile bool _globalKill;
+ } killCurrentOp;
+
+}
diff --git a/src/mongo/db/cursor.cpp b/src/mongo/db/cursor.cpp
new file mode 100644
index 00000000000..ac7afc1532b
--- /dev/null
+++ b/src/mongo/db/cursor.cpp
@@ -0,0 +1,166 @@
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "pdfile.h"
+#include "curop-inl.h"
+
+namespace mongo {
+
+ bool BasicCursor::advance() {
+ killCurrentOp.checkForInterrupt();
+ if ( eof() ) {
+ if ( tailable_ && !last.isNull() ) {
+ curr = s->next( last );
+ }
+ else {
+ return false;
+ }
+ }
+ else {
+ last = curr;
+ curr = s->next( curr );
+ }
+ incNscanned();
+ return ok();
+ }
+
+ /* these will be used outside of mutexes - really functors - thus the const */
+ class Forward : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getNext( prev );
+ }
+ } _forward;
+
+ class Reverse : public AdvanceStrategy {
+ virtual DiskLoc next( const DiskLoc &prev ) const {
+ return prev.rec()->getPrev( prev );
+ }
+ } _reverse;
+
+ const AdvanceStrategy *forward() {
+ return &_forward;
+ }
+ const AdvanceStrategy *reverse() {
+ return &_reverse;
+ }
+
+ DiskLoc nextLoop( NamespaceDetails *nsd, const DiskLoc &prev ) {
+ assert( nsd->capLooped() );
+ DiskLoc next = forward()->next( prev );
+ if ( !next.isNull() )
+ return next;
+ return nsd->firstRecord();
+ }
+
+ DiskLoc prevLoop( NamespaceDetails *nsd, const DiskLoc &curr ) {
+ assert( nsd->capLooped() );
+ DiskLoc prev = reverse()->next( curr );
+ if ( !prev.isNull() )
+ return prev;
+ return nsd->lastRecord();
+ }
+
+ ForwardCappedCursor::ForwardCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() )
+ start = nsd->firstRecord();
+ else {
+ start = nsd->capExtent.ext()->firstRecord;
+ if ( !start.isNull() && start == nsd->capFirstNewRecord ) {
+ start = nsd->capExtent.ext()->lastRecord;
+ start = nextLoop( nsd, start );
+ }
+ }
+ }
+ curr = start;
+ s = this;
+ incNscanned();
+ }
+
+ DiskLoc ForwardCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return forward()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ return DiskLoc();
+ i = nextLoop( nsd, i );
+ // If we become capFirstNewRecord from same extent, advance to next extent.
+ if ( i == nsd->capFirstNewRecord &&
+ i != nsd->capExtent.ext()->firstRecord )
+ i = nextLoop( nsd, nsd->capExtent.ext()->lastRecord );
+ // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord
+ if ( i == nsd->capExtent.ext()->firstRecord )
+ i = nsd->capFirstNewRecord;
+ return i;
+ }
+
+ ReverseCappedCursor::ReverseCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) :
+ nsd( _nsd ) {
+ if ( !nsd )
+ return;
+ DiskLoc start = startLoc;
+ if ( start.isNull() ) {
+ if ( !nsd->capLooped() ) {
+ start = nsd->lastRecord();
+ }
+ else {
+ start = nsd->capExtent.ext()->lastRecord;
+ }
+ }
+ curr = start;
+ s = this;
+ incNscanned();
+ }
+
+ DiskLoc ReverseCappedCursor::next( const DiskLoc &prev ) const {
+ assert( nsd );
+ if ( !nsd->capLooped() )
+ return reverse()->next( prev );
+
+ DiskLoc i = prev;
+ // Last record
+ if ( nsd->capFirstNewRecord == nsd->capExtent.ext()->firstRecord ) {
+ if ( i == nextLoop( nsd, nsd->capExtent.ext()->lastRecord ) ) {
+ return DiskLoc();
+ }
+ }
+ else {
+ if ( i == nsd->capExtent.ext()->firstRecord ) {
+ return DiskLoc();
+ }
+ }
+ // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev.
+ if ( i == nsd->capFirstNewRecord )
+ i = prevLoop( nsd, nsd->capExtent.ext()->firstRecord );
+ else
+ i = prevLoop( nsd, i );
+ // If we just became last in cap extent, advance past capFirstNewRecord
+ // (We know capExtent.ext()->firstRecord != capFirstNewRecord, since would
+ // have returned DiskLoc() earlier otherwise.)
+ if ( i == nsd->capExtent.ext()->lastRecord )
+ i = reverse()->next( nsd->capFirstNewRecord );
+
+ return i;
+ }
+} // namespace mongo
diff --git a/src/mongo/db/cursor.h b/src/mongo/db/cursor.h
new file mode 100644
index 00000000000..8e9e922733d
--- /dev/null
+++ b/src/mongo/db/cursor.h
@@ -0,0 +1,246 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+
+#include "jsobj.h"
+#include "diskloc.h"
+#include "matcher.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+ class Record;
+ class CoveredIndexMatcher;
+
+ /* Query cursors, base class. This is for our internal cursors. "ClientCursor" is a separate
+ concept and is for the user's cursor.
+
+ WARNING concurrency: the vfunctions below are called back from within a
+ ClientCursor::ccmutex. Don't cause a deadlock, you've been warned.
+ */
+ class Cursor : boost::noncopyable {
+ public:
+ virtual ~Cursor() {}
+ virtual bool ok() = 0;
+ bool eof() { return !ok(); }
+ virtual Record* _current() = 0;
+ virtual BSONObj current() = 0;
+ virtual DiskLoc currLoc() = 0;
+ virtual bool advance() = 0; /*true=ok*/
+ virtual BSONObj currKey() const { return BSONObj(); }
+
+        // DiskLoc the cursor requires for continued operation.  Before this
+        // DiskLoc is deleted, the cursor must be advanced or destroyed.
+ virtual DiskLoc refLoc() = 0;
+
+ /* Implement these if you want the cursor to be "tailable" */
+
+ /* Request that the cursor starts tailing after advancing past last record. */
+ /* The implementation may or may not honor this request. */
+ virtual void setTailable() {}
+ /* indicates if tailing is enabled. */
+ virtual bool tailable() {
+ return false;
+ }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& b) { }
+
+ /* optional to implement. if implemented, means 'this' is a prototype */
+ virtual Cursor* clone() {
+ return 0;
+ }
+
+ virtual BSONObj indexKeyPattern() {
+ return BSONObj();
+ }
+
+ virtual bool supportGetMore() = 0;
+
+ /* called after every query block is iterated -- i.e. between getMore() blocks
+ so you can note where we are, if necessary.
+ */
+ virtual void noteLocation() { }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() { }
+
+ /**
+ * Called before a document pointed at by an earlier iterate of this cursor is to be
+ * modified. It is ok if the current iterate also points to the document to be modified.
+ */
+ virtual void prepareToTouchEarlierIterate() { noteLocation(); }
+
+ /** Recover from a previous call to prepareToTouchEarlierIterate(). */
+ virtual void recoverFromTouchingEarlierIterate() { checkLocation(); }
+
+ virtual bool supportYields() = 0;
+
+ /** Called before a ClientCursor yield. */
+ virtual bool prepareToYield() { noteLocation(); return supportYields(); }
+
+ /** Called after a ClientCursor yield. Recovers from a previous call to prepareToYield(). */
+ virtual void recoverFromYield() { checkLocation(); }
+
+ virtual string toString() { return "abstract?"; }
+
+ /* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
+ if a multikey index traversal:
+ if loc has already been sent, returns true.
+ otherwise, marks loc as sent.
+ */
+ virtual bool getsetdup(DiskLoc loc) = 0;
+
+ virtual bool isMultiKey() const = 0;
+
+ virtual bool autoDedup() const { return true; }
+
+ /**
+ * return true if the keys in the index have been modified from the main doc
+ * if you have { a : 1 , b : [ 1 , 2 ] }
+ * an index on { a : 1 } would not be modified
+         * an index on { b : 1 } would be, since the values of the array are put in the index,
+         * not the array itself
+ */
+ virtual bool modifiedKeys() const = 0;
+
+ virtual BSONObj prettyIndexBounds() const { return BSONArray(); }
+
+ virtual bool capped() const { return false; }
+
+ virtual long long nscanned() = 0;
+
+ // The implementation may return different matchers depending on the
+ // position of the cursor. If matcher() is nonzero at the start,
+ // matcher() should be checked each time advance() is called.
+ // Implementations which generate their own matcher should return this
+ // to avoid a matcher being set manually.
+ // Note that the return values differ subtly here
+
+ // Used when we want fast matcher lookup
+ virtual CoveredIndexMatcher *matcher() const { return 0; }
+ // Used when we need to share this matcher with someone else
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return shared_ptr< CoveredIndexMatcher >(); }
+
+ virtual bool currentMatches( MatchDetails *details = 0 ) {
+ return !matcher() || matcher()->matchesCurrent( this, details );
+ }
+
+ // A convenience function for setting the value of matcher() manually
+        // so it may be accessed later.  Implementations which must generate
+ // their own matcher() should assert here.
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) {
+ massert( 13285, "manual matcher config not allowed", false );
+ }
+
+ virtual void explainDetails( BSONObjBuilder& b ) { return; }
+ };
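+
+    // Illustrative iteration protocol (sketch): callers typically drive a Cursor like
+    // this, consulting the matcher and the dup set at each position:
+    //     while( c->ok() ) {
+    //         if( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
+    //             BSONObj obj = c->current();
+    //             // ... use obj ...
+    //         }
+    //         c->advance();
+    //     }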
+
+ // strategy object implementing direction of traversal.
+ class AdvanceStrategy {
+ public:
+ virtual ~AdvanceStrategy() { }
+ virtual DiskLoc next( const DiskLoc &prev ) const = 0;
+ };
+
+ const AdvanceStrategy *forward();
+ const AdvanceStrategy *reverse();
+
+ /* table-scan style cursor */
+ class BasicCursor : public Cursor {
+ public:
+ BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ), _nscanned() {
+ incNscanned();
+ init();
+ }
+ BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ), _nscanned() {
+ init();
+ }
+ bool ok() { return !curr.isNull(); }
+ Record* _current() {
+ assert( ok() );
+ return curr.rec();
+ }
+ BSONObj current() {
+ Record *r = _current();
+ BSONObj j(r);
+ return j;
+ }
+ virtual DiskLoc currLoc() { return curr; }
+ virtual DiskLoc refLoc() { return curr.isNull() ? last : curr; }
+ bool advance();
+ virtual string toString() { return "BasicCursor"; }
+ virtual void setTailable() {
+ if ( !curr.isNull() || !last.isNull() )
+ tailable_ = true;
+ }
+ virtual bool tailable() { return tailable_; }
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ virtual bool isMultiKey() const { return false; }
+ virtual bool modifiedKeys() const { return false; }
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return true; }
+ virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+ virtual long long nscanned() { return _nscanned; }
+
+ protected:
+ DiskLoc curr, last;
+ const AdvanceStrategy *s;
+ void incNscanned() { if ( !curr.isNull() ) { ++_nscanned; } }
+ private:
+ bool tailable_;
+ shared_ptr< CoveredIndexMatcher > _matcher;
+ long long _nscanned;
+ void init() { tailable_ = false; }
+ };
+
+ /* used for order { $natural: -1 } */
+ class ReverseCursor : public BasicCursor {
+ public:
+ ReverseCursor(DiskLoc dl) : BasicCursor( dl, reverse() ) { }
+ ReverseCursor() : BasicCursor( reverse() ) { }
+ virtual string toString() { return "ReverseCursor"; }
+ };
+
+ class ForwardCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ForwardCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ForwardCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+ class ReverseCappedCursor : public BasicCursor, public AdvanceStrategy {
+ public:
+ ReverseCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() );
+ virtual string toString() {
+ return "ReverseCappedCursor";
+ }
+ virtual DiskLoc next( const DiskLoc &prev ) const;
+ virtual bool capped() const { return true; }
+ private:
+ NamespaceDetails *nsd;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/d_concurrency.cpp b/src/mongo/db/d_concurrency.cpp
new file mode 100755
index 00000000000..e3ad974cbfc
--- /dev/null
+++ b/src/mongo/db/d_concurrency.cpp
@@ -0,0 +1,231 @@
+// @file d_concurrency.cpp
+
+#include "pch.h"
+#include "d_concurrency.h"
+#include "../util/concurrency/threadlocal.h"
+#include "../util/concurrency/rwlock.h"
+#include "../util/concurrency/value.h"
+#include "../util/assert_util.h"
+#include "client.h"
+#include "namespacestring.h"
+#include "d_globals.h"
+
+// oplog locking
+// no top level read locks
+// system.profile writing
+// oplog now
+// yielding
+// commitIfNeeded
+
+namespace mongo {
+
+ using namespace clcimpl;
+
+ Client::LockStatus::LockStatus() {
+ excluder=global=collection=0;
+ }
+
+ namespace clcimpl {
+ Shared::Shared(unsigned& _state, RWLock& lock) : state(_state) {
+ rw = 0;
+ if( state ) {
+ // already locked
+ dassert( (state & (AcquireShared|AcquireExclusive)) == 0 );
+ return;
+ }
+ rw = &lock;
+ state = AcquireShared;
+ rw->lock_shared();
+ state = LockedShared;
+ }
+ Shared::~Shared() {
+ if( rw ) {
+ state = Unlocked;
+ rw->unlock_shared();
+ }
+ }
+ Exclusive::Exclusive(unsigned& _state, RWLock& lock) : state(_state) {
+ rw = 0;
+ if( state ) {
+ // already locked
+ dassert( (state & (AcquireShared|AcquireExclusive)) == 0 );
+ assert( state == LockedExclusive ); // can't be in shared state
+ return;
+ }
+ rw = &lock;
+ state = AcquireExclusive;
+ rw->lock();
+ state = LockedExclusive;
+ }
+ Exclusive::~Exclusive() {
+ if( rw ) {
+ state = Unlocked;
+ rw->unlock();
+ }
+ }
+ } // clcimpl namespace
+
+ // this tie-in temporary until MongoMutex is folded in more directly.
+ // called when the lock has been achieved
+ void MongoMutex::lockedExclusively() {
+ Client& c = cc();
+        curopGotLock(&c); // hopefully lockStatus replaces this one day
+ c.lockStatus.global = clcimpl::LockedExclusive;
+ _minfo.entered(); // hopefully eliminate one day
+ }
+
+ void MongoMutex::unlockingExclusively() {
+ Client& c = cc();
+ _minfo.leaving();
+ c.lockStatus.global = Unlocked;
+ }
+
+ MongoMutex::MongoMutex(const char *name) : _m(name) {
+ static int n = 0;
+        assert( ++n == 1 ); // releasingWriteLock below assumes MongoMutex is a singleton and uses the dbMutex ref above
+ _remapPrivateViewRequested = false;
+ }
+
+ bool subcollectionOf(const string& parent, const char *child) {
+ if( parent == child )
+ return true;
+ if( !str::startsWith(child, parent) )
+ return false;
+ const char *p = child + parent.size();
+ uassert(15963, str::stream() << "bad collection name: " << child, !str::endsWith(p, '.'));
+ return *p == '.' && p[1] == '$';
+ }
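+
+    // examples (illustrative): subcollectionOf("foo.bar", "foo.bar") and
+    // subcollectionOf("foo.bar", "foo.bar.$idx") are true (the latter covers index
+    // namespaces), while subcollectionOf("foo.bar", "foo.barbaz") is false.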
+
+    // (maybe tbd) ...
+    // for simplicity we will use the global write lock for writes to system.* collections
+    // for now; this is also somewhat faster, as we don't need to take an extra latch for them;
+    // otherwise there are cases that would need careful handling, such as namespacedetails
+    // methods reaching into system.indexes implicitly
+    // exception : system.profile
+ static bool lkspecial(const string& ns) {
+ NamespaceString s(ns);
+ return s.isSystem() && s.coll != "system.profile";
+ }
+
+    /** Notes on d.writeExcluder
+        we want to be able to block any attempted write while allowing reads; additionally
+        we force non-greedy acquisition, disallowing greediness of write lock acquisitions,
+        so that reads can continue. d.writeExcluder exists for that purpose. The #1 user is
+        groupCommitWithLimitedLocks(), but it is useful elsewhere, such as for lock and fsync.
+    */
+
+ ExcludeAllWrites::ExcludeAllWrites() :
+ lk(cc().lockStatus.excluder, d.writeExcluder),
+ gslk()
+ {
+ LOG(3) << "ExcludeAllWrites" << endl;
+ wassert( !d.dbMutex.isWriteLocked() );
+    }
+ ExcludeAllWrites::~ExcludeAllWrites() {
+ }
+
+ // CLC turns on the "collection level concurrency" code
+ // (which is under development and not finished)
+#if defined(CLC)
+ // called after a context is set. check that the correct collection is locked
+ void Client::checkLocks() const {
+ DEV {
+ if( !d.dbMutex.isWriteLocked() ) {
+ const char *n = ns();
+ if( lockStatus.whichCollection.empty() ) {
+ log() << "DEBUG checkLocks error expected to already be locked: " << n << endl;
+ dassert(false);
+ }
+ dassert( subcollectionOf(lockStatus.whichCollection, n) || lkspecial(n) );
+ }
+ }
+ }
+#endif
+
+ // we don't keep these locks in the namespacedetailstransient and Database
+ // objects -- that makes things safer as we need not prove to ourselves that they
+ // are always in scope when we need them.
+ // todo: we don't clean these locks up yet.
+ // todo: avoiding the mutex here might be nice.
+ class LockObjectForEachCollection {
+ //mapsf<string,RWLock*> dblocks;
+ mapsf<string,RWLock*> nslocks;
+ public:
+ /*RWLock& fordb(string db) {
+ mapsf<string,RWLock*>::ref r(dblocks);
+ RWLock*& rw = r[db];
+ if( rw == 0 )
+ rw = new RWLock(0);
+ return *rw;
+ }*/
+ RWLock& forns(string ns) {
+ mapsf<string,RWLock*>::ref r(nslocks);
+#if defined(CLC)
+ massert(15964, str::stream() << "bad collection name to lock: " << ns, str::contains(ns, '.'));
+#endif
+ RWLock*& rw = r[ns];
+ if( rw == 0 ) {
+ rw = new RWLock(0);
+ }
+ return *rw;
+ }
+ } theLocks;
+
+#if defined(CLC)
+ LockCollectionForWriting::Locks::Locks(string ns) :
+ excluder(d.writeExcluder),
+ gslk(),
+ clk(theLocks.forns(ns),true)
+ { }
+ LockCollectionForWriting::~LockCollectionForWriting() {
+ if( locks.get() ) {
+ Client::LockStatus& s = cc().lockStatus;
+ s.whichCollection.clear();
+ }
+ }
+ LockCollectionForWriting::LockCollectionForWriting(string coll)
+ {
+ Client::LockStatus& s = cc().lockStatus;
+ LockBits b(s.state);
+ if( !s.whichCollection.empty() ) {
+ if( !subcollectionOf(s.whichCollection, coll.c_str()) ) {
+ massert(15937, str::stream() << "can't nest lock of " << coll << " beneath " << s.whichCollection, false);
+ }
+ if( b.get(LockBits::Collection) != LockBits::Exclusive ) {
+ massert(15938, str::stream() << "want collection write lock but it is already read locked " << s.state, false);
+ }
+ return;
+ }
+        verify(15965, !lkspecial(coll)); // you must take the global write lock for writes to specials
+ s.whichCollection = coll;
+ b.set(LockBits::Collection, LockBits::NotLocked, LockBits::Exclusive);
+ locks.reset( new Locks(coll) );
+ }
+#endif
+
+ LockCollectionForReading::LockCollectionForReading(string ns) :
+ gslk(),
+ clk( cc().lockStatus.collection, theLocks.forns(ns) )
+ {
+ Client::LockStatus& s = cc().lockStatus;
+ if( s.whichCollection.empty() ) {
+ s.whichCollection = ns;
+ }
+ else {
+ if( !subcollectionOf(s.whichCollection, ns.c_str()) ) {
+ if( lkspecial(ns) )
+ return;
+ massert(15939,
+ str::stream() << "can't nest lock of " << ns << " beneath " << s.whichCollection,
+ false);
+ }
+ }
+ }
+ LockCollectionForReading::~LockCollectionForReading() {
+ if( !clk.recursed() ) {
+ Client::LockStatus& s = cc().lockStatus;
+ s.whichCollection.clear();
+ }
+ }
+
+}
diff --git a/src/mongo/db/d_concurrency.h b/src/mongo/db/d_concurrency.h
new file mode 100644
index 00000000000..ba2f64f5126
--- /dev/null
+++ b/src/mongo/db/d_concurrency.h
@@ -0,0 +1,67 @@
+// @file d_concurrency.h
+
+#pragma once
+
+#include "../util/concurrency/rwlock.h"
+#include "db/mongomutex.h"
+
+namespace mongo {
+
+ namespace clcimpl {
+ enum LockStates { Unlocked, AcquireShared=1, LockedShared=2, AcquireExclusive=4, LockedExclusive=8 };
+ class Shared : boost::noncopyable {
+ unsigned& state;
+ RWLock *rw;
+ public:
+ Shared(unsigned& state, RWLock& lock);
+ ~Shared();
+ bool recursed() const { return rw == 0; }
+ };
+ class Exclusive : boost::noncopyable {
+ unsigned& state;
+ RWLock *rw;
+ public:
+ Exclusive(unsigned& state, RWLock& lock);
+ ~Exclusive();
+ };
+ }
+
+ typedef readlock GlobalSharedLock;
+
+ class ExcludeAllWrites : boost::noncopyable {
+ clcimpl::Exclusive lk;
+ GlobalSharedLock gslk;
+ public:
+ ExcludeAllWrites();
+ ~ExcludeAllWrites();
+ };
+
+ class todoGlobalWriteLock : boost::noncopyable {
+ public:
+ };
+
+ class LockCollectionForReading : boost::noncopyable {
+ GlobalSharedLock gslk;
+ clcimpl::Shared clk;
+ public:
+ LockCollectionForReading(string coll);
+ ~LockCollectionForReading();
+ };
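+
+    // Illustrative usage (sketch): a read path takes the global shared lock and the
+    // per-collection read lock with a single scoped object:
+    //     {
+    //         LockCollectionForReading lk( "test.foo" );
+    //         // ... read from test.foo ...
+    //     }   // both locks released here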
+
+#if defined(CLC)
+ class LockCollectionForWriting : boost::noncopyable {
+ struct Locks {
+ Locks(string ns);
+ SimpleRWLock::Shared excluder;
+ GlobalSharedLock gslk;
+ rwlock clk;
+ };
+ scoped_ptr<Locks> locks;
+ public:
+ LockCollectionForWriting(string db);
+ ~LockCollectionForWriting();
+ };
+#else
+#endif
+
+}
diff --git a/src/mongo/db/d_globals.cpp b/src/mongo/db/d_globals.cpp
new file mode 100644
index 00000000000..7e0fd9e8cb0
--- /dev/null
+++ b/src/mongo/db/d_globals.cpp
@@ -0,0 +1,20 @@
+// @file d_globals.cpp
+
+#include "pch.h"
+#include "d_globals.h"
+#include "../util/concurrency/rwlock.h"
+#include "clientcursor.h"
+#include "mongomutex.h"
+
+namespace mongo {
+
+ DGlobals::DGlobals() :
+ writeExcluder( *(new RWLock("writeexcluder")) ),
+ dbMutex( *(new MongoMutex("dbMutex")) ),
+ clientCursorMonitor( *(new ClientCursorMonitor()) )
+ {
+ }
+
+ DGlobals d;
+
+}
diff --git a/src/mongo/db/d_globals.h b/src/mongo/db/d_globals.h
new file mode 100644
index 00000000000..7c95d463cc3
--- /dev/null
+++ b/src/mongo/db/d_globals.h
@@ -0,0 +1,27 @@
+// @file d_globals.h
+//
+// these are global variables used in mongod ("d"). also used in test binary as that is effectively a variation on mongod code.
+// that is, these are not in mongos.
+//
+
+#pragma once
+
+namespace mongo {
+
+ class RWLock;
+ class MongoMutex;
+ class ClientCursorMonitor;
+
+ struct DGlobals : boost::noncopyable {
+ DGlobals();
+
+ // these are intentionally never deleted:
+ RWLock& writeExcluder;
+ MongoMutex &dbMutex;
+ ClientCursorMonitor& clientCursorMonitor;
+
+ };
+
+ extern DGlobals d;
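+
+    // illustrative usage (sketch): mongod code reaches these singletons through the
+    // global 'd', e.g. d.dbMutex.assertWriteLocked() before mutating shared state.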
+
+}
diff --git a/src/mongo/db/database.cpp b/src/mongo/db/database.cpp
new file mode 100644
index 00000000000..2d55fd35626
--- /dev/null
+++ b/src/mongo/db/database.cpp
@@ -0,0 +1,423 @@
+// database.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "database.h"
+#include "instance.h"
+#include "clientcursor.h"
+#include "databaseholder.h"
+
+namespace mongo {
+
+ bool Database::_openAllFiles = true;
+
+ void assertDbAtLeastReadLocked(const Database *) {
+ // temp impl
+ d.dbMutex.assertAtLeastReadLocked();
+ }
+
+ void assertDbWriteLocked(const Database *) {
+ // temp impl
+ d.dbMutex.assertWriteLocked();
+ }
+
+ Database::~Database() {
+ d.dbMutex.assertWriteLocked();
+ magic = 0;
+ size_t n = _files.size();
+ for ( size_t i = 0; i < n; i++ )
+ delete _files[i];
+ if( ccByLoc.size() ) {
+ log() << "\n\n\nWARNING: ccByLoc not empty on database close! " << ccByLoc.size() << ' ' << name << endl;
+ }
+ }
+
+ Database::Database(const char *nm, bool& newDb, const string& _path )
+ : name(nm), path(_path), namespaceIndex( path, name ),
+ profileName(name + ".system.profile")
+ {
+ try {
+ {
+ // check db name is valid
+ size_t L = strlen(nm);
+ uassert( 10028 , "db name is empty", L > 0 );
+ uassert( 10032 , "db name too long", L < 64 );
+ uassert( 10029 , "bad db name [1]", *nm != '.' );
+ uassert( 10030 , "bad db name [2]", nm[L-1] != '.' );
+ uassert( 10031 , "bad char(s) in db name", strchr(nm, ' ') == 0 );
+ }
+ newDb = namespaceIndex.exists();
+ profile = cmdLine.defaultProfile;
+ checkDuplicateUncasedNames(true);
+ // If already exists, open. Otherwise behave as if empty until
+ // there's a write, then open.
+ if ( ! newDb || cmdLine.defaultProfile ) {
+ namespaceIndex.init();
+ if( _openAllFiles )
+ openAllFiles();
+ }
+ magic = 781231;
+ } catch(std::exception& e) {
+            log() << "warning: database " << path << ' ' << nm << " could not be opened" << endl;
+ log() << e.what() << endl;
+ // since destructor won't be called:
+ for ( size_t i = 0; i < _files.size(); i++ )
+ delete _files[i];
+ throw;
+ }
+ }
+
+ void Database::checkDuplicateUncasedNames(bool inholderlock) const {
+ string duplicate = duplicateUncasedName(inholderlock, name, path );
+ if ( !duplicate.empty() ) {
+ stringstream ss;
+ ss << "db already exists with different case other: [" << duplicate << "] me [" << name << "]";
+ uasserted( DatabaseDifferCaseCode , ss.str() );
+ }
+ }
+
+ /*static*/
+ string Database::duplicateUncasedName( bool inholderlock, const string &name, const string &path, set< string > *duplicates ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ if ( duplicates ) {
+ duplicates->clear();
+ }
+
+ vector<string> others;
+ getDatabaseNames( others , path );
+
+ set<string> allShortNames;
+ dbHolder().getAllShortNames( inholderlock, allShortNames );
+
+ others.insert( others.end(), allShortNames.begin(), allShortNames.end() );
+
+ for ( unsigned i=0; i<others.size(); i++ ) {
+
+ if ( strcasecmp( others[i].c_str() , name.c_str() ) )
+ continue;
+
+ if ( strcmp( others[i].c_str() , name.c_str() ) == 0 )
+ continue;
+
+ if ( duplicates ) {
+ duplicates->insert( others[i] );
+ } else {
+ return others[i];
+ }
+ }
+ if ( duplicates ) {
+ return duplicates->empty() ? "" : *duplicates->begin();
+ }
+ return "";
+ }
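+
+    // example (illustrative): with a database "Foo" already on disk,
+    // duplicateUncasedName(false, "foo", dbpath) returns "Foo"; exact-case matches are
+    // skipped, so for an existing "foo" it returns "".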
+
+ boost::filesystem::path Database::fileName( int n ) const {
+ stringstream ss;
+ ss << name << '.' << n;
+ boost::filesystem::path fullName;
+ fullName = boost::filesystem::path(path);
+ if ( directoryperdb )
+ fullName /= name;
+ fullName /= ss.str();
+ return fullName;
+ }
+
+ bool Database::openExistingFile( int n ) {
+ assert(this);
+ d.dbMutex.assertWriteLocked();
+ {
+ // must not yet be visible to others as we aren't in the db's write lock and
+ // we will write to _files vector - thus this assert.
+ bool loaded = dbHolder().__isLoaded(name, path);
+ assert( !loaded );
+ }
+ // additionally must be in the dbholder mutex (no assert for that yet)
+
+        // todo: why here? that could be bad as we may hold only a read lock here
+ namespaceIndex.init();
+
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ massert( 15924 , str::stream() << "getFile(): bad file number value " << n << " (corrupt db?): run repair", false);
+ }
+
+ {
+ if( n < (int) _files.size() && _files[n] ) {
+ dlog(2) << "openExistingFile " << n << " is already open" << endl;
+ return true;
+ }
+ }
+
+ {
+ boost::filesystem::path fullName = fileName( n );
+ string fullNameString = fullName.string();
+ MongoDataFile *df = new MongoDataFile(n);
+ try {
+ if( !df->openExisting( fullNameString.c_str() ) ) {
+ delete df;
+ return false;
+ }
+ }
+ catch ( AssertionException& ) {
+ delete df;
+ throw;
+ }
+ while ( n >= (int) _files.size() ) {
+ _files.push_back(0);
+ }
+ _files[n] = df;
+ }
+
+ return true;
+ }
+
+    // todo : we stop once a datafile doesn't exist.
+    //        if one datafile in the middle were missing we should keep going for
+    //        repair purposes, yet we do not.
+ void Database::openAllFiles() {
+ //log() << "TEMP openallfiles " << path << ' ' << name << endl;
+ assert(this);
+ int n = 0;
+ while( openExistingFile(n) ) {
+ n++;
+ }
+
+ /*
+ int n = 0;
+ while( exists(n) ) {
+ getFile(n);
+ n++;
+ }
+ // If last file is empty, consider it preallocated and make sure it's not mapped
+ // until a write is requested
+ if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) {
+ delete _files[ n - 1 ];
+ _files.pop_back();
+ }*/
+ }
+
+ // todo: this is called a lot. streamline the common case
+ MongoDataFile* Database::getFile( int n, int sizeNeeded , bool preallocateOnly) {
+ assert(this);
+ DEV assertDbAtLeastReadLocked(this);
+
+ namespaceIndex.init();
+ if ( n < 0 || n >= DiskLoc::MaxFiles ) {
+ out() << "getFile(): n=" << n << endl;
+ massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false);
+ }
+ DEV {
+ if ( n > 100 ) {
+ out() << "getFile(): n=" << n << endl;
+ }
+ }
+ MongoDataFile* p = 0;
+ if ( !preallocateOnly ) {
+ while ( n >= (int) _files.size() ) {
+ DEV if( !d.dbMutex.isWriteLocked() ) {
+ log() << "error: getFile() called in a read lock, yet file to return is not yet open" << endl;
+ log() << " getFile(" << n << ") _files.size:" <<_files.size() << ' ' << fileName(n).string() << endl;
+ log() << " context ns: " << cc().ns() << " openallfiles:" << _openAllFiles << endl;
+ }
+ assertDbWriteLocked(this);
+ _files.push_back(0);
+ }
+ p = _files[n];
+ }
+ if ( p == 0 ) {
+ assertDbWriteLocked(this);
+ boost::filesystem::path fullName = fileName( n );
+ string fullNameString = fullName.string();
+ p = new MongoDataFile(n);
+ int minSize = 0;
+ if ( n != 0 && _files[ n - 1 ] )
+ minSize = _files[ n - 1 ]->getHeader()->fileLength;
+ if ( sizeNeeded + DataFileHeader::HeaderSize > minSize )
+ minSize = sizeNeeded + DataFileHeader::HeaderSize;
+ try {
+ p->open( fullNameString.c_str(), minSize, preallocateOnly );
+ }
+ catch ( AssertionException& ) {
+ delete p;
+ throw;
+ }
+ if ( preallocateOnly )
+ delete p;
+ else
+ _files[n] = p;
+ }
+ return preallocateOnly ? 0 : p;
+ }
+
+ MongoDataFile* Database::addAFile( int sizeNeeded, bool preallocateNextFile ) {
+ assertDbWriteLocked(this);
+ int n = (int) _files.size();
+ MongoDataFile *ret = getFile( n, sizeNeeded );
+ if ( preallocateNextFile )
+ preallocateAFile();
+ return ret;
+ }
+
+ bool fileIndexExceedsQuota( const char *ns, int fileIndex, bool enforceQuota ) {
+ return
+ cmdLine.quota &&
+ enforceQuota &&
+ fileIndex >= cmdLine.quotaFiles &&
+ // we don't enforce the quota on "special" namespaces as that could lead to problems -- e.g.
+ // rejecting an index insert after inserting the main record.
+ !NamespaceString::special( ns ) &&
+ NamespaceString( ns ).db != "local";
+ }
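+
+    // example (illustrative): with --quota and --quotaFiles 8, allocating file #8 for
+    // "test.foo" exceeds the quota; special namespaces and the "local" db are exempt.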
+
+ MongoDataFile* Database::suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota ) {
+
+ // check existing files
+ for ( int i=numFiles()-1; i>=0; i-- ) {
+ MongoDataFile* f = getFile( i );
+ if ( f->getHeader()->unusedLength >= sizeNeeded ) {
+ if ( fileIndexExceedsQuota( ns, i-1, enforceQuota ) ) // NOTE i-1 is the value used historically for this check.
+ ;
+ else
+ return f;
+ }
+ }
+
+ if ( fileIndexExceedsQuota( ns, numFiles(), enforceQuota ) )
+ uasserted(12501, "quota exceeded");
+
+ // allocate files until we either get one big enough or hit maxSize
+ for ( int i = 0; i < 8; i++ ) {
+ MongoDataFile* f = addAFile( sizeNeeded, preallocate );
+
+ if ( f->getHeader()->unusedLength >= sizeNeeded )
+ return f;
+
+ if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop
+ return f;
+ }
+
+ uasserted(14810, "couldn't allocate space (suitableFile)"); // callers don't check for null return code
+ return 0;
+ }
+
+ MongoDataFile* Database::newestFile() {
+ int n = numFiles();
+ if ( n == 0 )
+ return 0;
+ return getFile(n-1);
+ }
+
+
+ Extent* Database::allocExtent( const char *ns, int size, bool capped, bool enforceQuota ) {
+ // todo: when profiling, these may be worth logging into profile collection
+ bool fromFreeList = true;
+ Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped );
+ if( e == 0 ) {
+ fromFreeList = false;
+ e = suitableFile( ns, size, !capped, enforceQuota )->createExtent( ns, size, capped );
+ }
+ LOG(1) << "allocExtent " << ns << " size " << size << ' ' << fromFreeList << endl;
+ return e;
+ }
+
+
+ bool Database::setProfilingLevel( int newLevel , string& errmsg ) {
+ if ( profile == newLevel )
+ return true;
+
+ if ( newLevel < 0 || newLevel > 2 ) {
+ errmsg = "profiling level has to be >=0 and <= 2";
+ return false;
+ }
+
+ if ( newLevel == 0 ) {
+ profile = 0;
+ return true;
+ }
+
+ assert( cc().database() == this );
+
+ if ( ! namespaceIndex.details( profileName.c_str() ) ) {
+ log() << "creating profile collection: " << profileName << endl;
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", 1024*1024 );
+            if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , false /* we don't replicate profile messages */ ) ) {
+ return false;
+ }
+ }
+ profile = newLevel;
+ return true;
+ }
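+
+    // illustrative use (sketch): the shell's db.setProfilingLevel(1) reaches this via
+    // the profile command, roughly:
+    //     string errmsg;
+    //     bool ok = cc().database()->setProfilingLevel( 1, errmsg );
+    // creating the capped <dbname>.system.profile collection on first use.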
+
+ bool Database::exists(int n) const {
+ return boost::filesystem::exists( fileName( n ) );
+ }
+
+ int Database::numFiles() const {
+ DEV assertDbAtLeastReadLocked(this);
+ return (int) _files.size();
+ }
+
+ void Database::flushFiles( bool sync ) {
+ assertDbAtLeastReadLocked(this);
+ for( vector<MongoDataFile*>::iterator i = _files.begin(); i != _files.end(); i++ ) {
+ MongoDataFile *f = *i;
+ f->flush(sync);
+ }
+ }
+
+ long long Database::fileSize() const {
+ long long size=0;
+ for (int n=0; exists(n); n++)
+ size += boost::filesystem::file_size( fileName(n) );
+ return size;
+ }
+
+ Database* DatabaseHolder::getOrCreate( const string& ns , const string& path , bool& justCreated ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ DBs& m = _paths[path];
+
+ string dbname = _todb( ns );
+
+ {
+ DBs::iterator i = m.find(dbname);
+ if( i != m.end() ) {
+ justCreated = false;
+ return i->second;
+ }
+ }
+
+        // todo: protect against getting sprayed with requests for different db names that don't exist -
+        //       that would make the DBs map very large.  not clear how to handle it though;
+        //       perhaps just log it, which is what we do here with the "> 40" :
+ bool cant = !d.dbMutex.isWriteLocked();
+ if( logLevel >= 1 || m.size() > 40 || cant || DEBUG_BUILD ) {
+ log() << "opening db: " << (path==dbpath?"":path) << ' ' << dbname << endl;
+ }
+ massert(15927, "can't open database in a read lock. if db was just closed, consider retrying the query. might otherwise indicate an internal error", !cant);
+
+ Database *db = new Database( dbname.c_str() , justCreated , path );
+ m[dbname] = db;
+ _size++;
+ return db;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/database.h b/src/mongo/db/database.h
new file mode 100644
index 00000000000..a7867e20e8c
--- /dev/null
+++ b/src/mongo/db/database.h
@@ -0,0 +1,145 @@
+// database.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cmdline.h"
+#include "namespace.h"
+
+namespace mongo {
+
+ class Extent;
+ class MongoDataFile;
+ class ClientCursor;
+ struct ByLocKey;
+ typedef map<ByLocKey, ClientCursor*> CCByLoc;
+
+ /**
+     * Database represents a single database.
+     * Each database has its own set of files -- dbname.ns, dbname.0, dbname.1, ...
+ * NOT memory mapped
+ */
+ class Database {
+ public:
+ static bool _openAllFiles;
+
+ // you probably need to be in dbHolderMutex when constructing this
+ Database(const char *nm, /*out*/ bool& newDb, const string& _path = dbpath);
+ private:
+ ~Database(); // closes files and other cleanup see below.
+ public:
+ /* you must use this to close - there is essential code in this method that is not in the ~Database destructor.
+ thus the destructor is private. this could be cleaned up one day...
+ */
+ static void closeDatabase( const char *db, const string& path );
+
+ void openAllFiles();
+
+ /**
+ * tries to make sure that this hasn't been deleted
+ */
+ bool isOk() const { return magic == 781231; }
+
+ bool isEmpty() { return ! namespaceIndex.allocated(); }
+
+ /**
+ * total file size of Database in bytes
+ */
+ long long fileSize() const;
+
+ int numFiles() const;
+
+ /**
+ * returns file valid for file number n
+     * returns the filesystem path for file number n
+ boost::filesystem::path fileName( int n ) const;
+
+ private:
+ bool exists(int n) const;
+ bool openExistingFile( int n );
+
+ public:
+ /**
+ * return file n. if it doesn't exist, create it
+ */
+ MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false );
+
+ MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile );
+
+ /**
+ * makes sure we have an extra file at the end that is empty
+ * safe to call this multiple times - the implementation will only preallocate one file
+ */
+ void preallocateAFile() { getFile( numFiles() , 0, true ); }
+
+ MongoDataFile* suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota );
+
+ Extent* allocExtent( const char *ns, int size, bool capped, bool enforceQuota );
+
+ MongoDataFile* newestFile();
+
+ /**
+ * @return true if success. false if bad level or error creating profile ns
+ */
+ bool setProfilingLevel( int newLevel , string& errmsg );
+
+ void flushFiles( bool sync );
+
+ /**
+ * @return true if ns is part of the database
+ * ns=foo.bar, db=foo returns true
+ */
+ bool ownsNS( const string& ns ) const {
+ if ( ! startsWith( ns , name ) )
+ return false;
+ return ns[name.size()] == '.';
+ }
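+        // examples (illustrative): for db "foo", ownsNS("foo.bar") is true, while
+        // ownsNS("foobar.baz") and ownsNS("foo") are false.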
+ private:
+ /**
+ * @throws DatabaseDifferCaseCode if the name is a duplicate based on
+ * case insensitive matching.
+ */
+ void checkDuplicateUncasedNames(bool inholderlockalready) const;
+ public:
+ /**
+ * @return name of an existing database with same text name but different
+ * casing, if one exists. Otherwise the empty string is returned. If
+ * 'duplicates' is specified, it is filled with all duplicate names.
+ */
+ static string duplicateUncasedName( bool inholderlockalready, const string &name, const string &path, set< string > *duplicates = 0 );
+
+ const string name; // "alleyinsider"
+ const string path;
+
+ private:
+
+        // must be in the dbLock when touching this (and write locked when writing to it, of course);
+ // however during Database object construction we aren't, which is ok as it isn't yet visible
+ // to others and we are in the dbholder lock then.
+ vector<MongoDataFile*> _files;
+
+ public: // this should be private later
+
+ NamespaceIndex namespaceIndex;
+ int profile; // 0=off.
+ const string profileName; // "alleyinsider.system.profile"
+ CCByLoc ccByLoc;
+ int magic; // used for making sure the object is still loaded in memory
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/databaseholder.h b/src/mongo/db/databaseholder.h
new file mode 100644
index 00000000000..7c878c4ed63
--- /dev/null
+++ b/src/mongo/db/databaseholder.h
@@ -0,0 +1,126 @@
+// @file databaseholder.h
+
+#pragma once
+
+namespace mongo {
+
+ /**
+ * path + dbname -> Database
+ */
+ class DatabaseHolder {
+ typedef map<string,Database*> DBs;
+ typedef map<string,DBs> Paths;
+ public:
+ DatabaseHolder() : _size(0) { }
+
+ bool __isLoaded( const string& ns , const string& path ) const {
+ Paths::const_iterator x = _paths.find( path );
+ if ( x == _paths.end() )
+ return false;
+ const DBs& m = x->second;
+
+ string db = _todb( ns );
+
+ DBs::const_iterator it = m.find(db);
+ return it != m.end();
+ }
+ // must be write locked as otherwise isLoaded could go false->true on you
+ // in the background and you might not expect that.
+ bool _isLoaded( const string& ns , const string& path ) const {
+ d.dbMutex.assertWriteLocked();
+ return __isLoaded(ns,path);
+ }
+
+ Database * get( const string& ns , const string& path ) const {
+ d.dbMutex.assertAtLeastReadLocked();
+ Paths::const_iterator x = _paths.find( path );
+ if ( x == _paths.end() )
+ return 0;
+ const DBs& m = x->second;
+ string db = _todb( ns );
+ DBs::const_iterator it = m.find(db);
+ if ( it != m.end() )
+ return it->second;
+ return 0;
+ }
+
+ void _put( const string& ns , const string& path , Database * db ) {
+ d.dbMutex.assertAtLeastReadLocked();
+ DBs& m = _paths[path];
+ Database*& d = m[_todb(ns)];
+ if( d ) {
+ dlog(2) << "info dbholder put db was already set " << ns << endl;
+ }
+ else {
+ _size++;
+ }
+ d = db;
+ }
+
+ Database* getOrCreate( const string& ns , const string& path , bool& justCreated );
+
+ void erase( const string& ns , const string& path ) {
+            d.dbMutex.assertWriteLocked(); // write lock req'd as a Database obj can be in use; dbHolderMutex is mainly just to control the holder itself
+ DBs& m = _paths[path];
+ _size -= (int)m.erase( _todb( ns ) );
+ }
+
+ /** @param force - force close even if something underway - use at shutdown */
+ bool closeAll( const string& path , BSONObjBuilder& result, bool force );
+
+        // "info" as this is informational only; it could change on you if you are not write locked
+ int sizeInfo() const { return _size; }
+
+ void forEach(boost::function<void(Database *)> f) const {
+ d.dbMutex.assertWriteLocked();
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
+ DBs m = i->second;
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
+ f(j->second);
+ }
+ }
+ }
+
+ /**
+ * gets all unique db names, ignoring paths
+ */
+ void getAllShortNames( bool locked, set<string>& all ) const {
+ d.dbMutex.assertAtLeastReadLocked();
+ for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) {
+ DBs m = i->second;
+ for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) {
+ all.insert( j->first );
+ }
+ }
+ }
+
+ private:
+ static string _todb( const string& ns ) {
+ string d = __todb( ns );
+ uassert( 13280 , (string)"invalid db name: " + ns , NamespaceString::validDBName( d ) );
+ return d;
+ }
+ static string __todb( const string& ns ) {
+ size_t i = ns.find( '.' );
+ if ( i == string::npos ) {
+ uassert( 13074 , "db name can't be empty" , ns.size() );
+ return ns;
+ }
+ uassert( 13075 , "db name can't be empty" , i > 0 );
+ return ns.substr( 0 , i );
+ }
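+        // e.g. _todb("alleyinsider.system.profile") == "alleyinsider";
+        // a bare db name such as _todb("alleyinsider") is returned unchanged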
+ Paths _paths;
+ int _size;
+ };
+
+ DatabaseHolder& dbHolderUnchecked();
+ inline const DatabaseHolder& dbHolder() {
+ dassert( d.dbMutex.atLeastReadLocked() );
+ return dbHolderUnchecked();
+ }
+ inline DatabaseHolder& dbHolderW() {
+ dassert( d.dbMutex.isWriteLocked() );
+ return dbHolderUnchecked();
+ }
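+
+    /* minimal usage sketch (illustrative only; assumes the appropriate lock is
+       already held, per the dasserts above):
+
+           bool justCreated;
+           Database* db = dbHolderW().getOrCreate( "test.foo" , dbpath , justCreated );
+           // subsequent lookups under the lock return the same object:
+           dassert( dbHolder().get( "test.foo" , dbpath ) == db );
+    */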
+
+}
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
new file mode 100644
index 00000000000..af03b447976
--- /dev/null
+++ b/src/mongo/db/db.cpp
@@ -0,0 +1,1309 @@
+// @file db.cpp : Defines main() for the mongod program.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "introspect.h"
+#include "repl.h"
+#include "../util/unittest.h"
+#include "../util/file_allocator.h"
+#include "../util/background.h"
+#include "../util/text.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "clientcursor.h"
+#include "pdfile.h"
+#include "stats/counters.h"
+#include "repl/rs.h"
+#include "../scripting/engine.h"
+#include "module.h"
+#include "cmdline.h"
+#include "stats/snapshots.h"
+#include "../util/concurrency/task.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "../util/net/message_server.h"
+#include "client.h"
+#include "restapi.h"
+#include "dbwebserver.h"
+#include "dur.h"
+#include "concurrency.h"
+#include "../s/d_writeback.h"
+#include "d_globals.h"
+
+#if defined(_WIN32)
+# include "../util/ntservice.h"
+#else
+# include <sys/file.h>
+#endif
+
+namespace mongo {
+
+ namespace dur {
+ extern unsigned long long DataLimitPerJournalFile;
+ }
+
+    /* only off if --nocursors, which is for debugging. */
+ extern bool useCursors;
+
+ /* only off if --nohints */
+ extern bool useHints;
+
+ extern int diagLogging;
+ extern unsigned lenForNewNsFiles;
+ extern int lockFile;
+ extern bool checkNsFilesOnLoad;
+ extern string repairpath;
+
+ void setupSignals( bool inFork );
+ void startReplication();
+ void exitCleanly( ExitCode code );
+
+ CmdLine cmdLine;
+ static bool scriptingEnabled = true;
+ bool noHttpInterface = false;
+ bool shouldRepairDatabases = 0;
+ static bool forceRepair = 0;
+ Timer startupSrandTimer;
+
+ const char *ourgetns() {
+ Client *c = currentClient.get();
+ if ( ! c )
+ return "";
+ Client::Context* cc = c->getContext();
+ return cc ? cc->ns() : "";
+ }
+
+ struct MyStartupTests {
+ MyStartupTests() {
+ assert( sizeof(OID) == 12 );
+ }
+ } mystartupdbcpp;
+
+ QueryResult* emptyMoreResult(long long);
+
+
+    /* todo: make this a real test. the tests in dbtests/ all seem to use dbdirectclient, which exhaust doesn't support yet. */
+// QueryOption_Exhaust
+#define TESTEXHAUST 0
+#if( TESTEXHAUST )
+ void testExhaust() {
+ sleepsecs(1);
+ unsigned n = 0;
+ auto f = [&n](const BSONObj& o) {
+ assert( o.valid() );
+ //cout << o << endl;
+ n++;
+ bool testClosingSocketOnError = false;
+ if( testClosingSocketOnError )
+ assert(false);
+ };
+ DBClientConnection db(false);
+ db.connect("localhost");
+ const char *ns = "local.foo";
+ if( db.count(ns) < 10000 )
+ for( int i = 0; i < 20000; i++ )
+ db.insert(ns, BSON("aaa" << 3 << "b" << "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
+
+ try {
+ db.query(f, ns, Query() );
+ }
+ catch(...) {
+ cout << "hmmm" << endl;
+ }
+
+ try {
+ db.query(f, ns, Query() );
+ }
+ catch(...) {
+ cout << "caught" << endl;
+ }
+
+ cout << n << endl;
+    }
+#endif
+
+ void sysRuntimeInfo() {
+ out() << "sysinfo:" << endl;
+#if defined(_SC_PAGE_SIZE)
+ out() << " page size: " << (int) sysconf(_SC_PAGE_SIZE) << endl;
+#endif
+#if defined(_SC_PHYS_PAGES)
+ out() << " _SC_PHYS_PAGES: " << sysconf(_SC_PHYS_PAGES) << endl;
+#endif
+#if defined(_SC_AVPHYS_PAGES)
+ out() << " _SC_AVPHYS_PAGES: " << sysconf(_SC_AVPHYS_PAGES) << endl;
+#endif
+ }
+
+ /* if server is really busy, wait a bit */
+ void beNice() {
+ sleepmicros( Client::recommendedYieldMicros() );
+ }
+
+ class MyMessageHandler : public MessageHandler {
+ public:
+ virtual void connected( AbstractMessagingPort* p ) {
+ Client& c = Client::initThread("conn", p);
+ c.getAuthenticationInfo()->isLocalHost = p->remote().isLocalHost();
+ }
+
+ virtual void process( Message& m , AbstractMessagingPort* port , LastError * le) {
+ while ( true ) {
+ if ( inShutdown() ) {
+ log() << "got request after shutdown()" << endl;
+ break;
+ }
+
+ lastError.startRequest( m , le );
+
+ DbResponse dbresponse;
+ assembleResponse( m, dbresponse, port->remote() );
+
+ if ( dbresponse.response ) {
+ port->reply(m, *dbresponse.response, dbresponse.responseTo);
+ if( dbresponse.exhaust ) {
+ MsgData *header = dbresponse.response->header();
+ QueryResult *qr = (QueryResult *) header;
+ long long cursorid = qr->cursorId;
+ if( cursorid ) {
+ assert( dbresponse.exhaust && *dbresponse.exhaust != 0 );
+                        string ns = dbresponse.exhaust; // before reset() frees it...
+ m.reset();
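+                        // hand-build an OP_GET_MORE request in place: message header
+                        // (size, id, responseTo, opcode) followed by the getMore body
+                        // (reserved int32, ns, ntoreturn=0, cursorid), so this loop can
+                        // stream the next batch back over the same socket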
+ BufBuilder b(512);
+ b.appendNum((int) 0 /*size set later in appendData()*/);
+ b.appendNum(header->id);
+ b.appendNum(header->responseTo);
+ b.appendNum((int) dbGetMore);
+ b.appendNum((int) 0);
+ b.appendStr(ns);
+ b.appendNum((int) 0); // ntoreturn
+ b.appendNum(cursorid);
+ m.appendData(b.buf(), b.len());
+ b.decouple();
+ DEV log() << "exhaust=true sending more" << endl;
+ beNice();
+ continue; // this goes back to top loop
+ }
+ }
+ }
+ break;
+ }
+ }
+
+ virtual void disconnected( AbstractMessagingPort* p ) {
+ Client * c = currentClient.get();
+ if( c ) c->shutdown();
+ globalScriptEngine->threadDone();
+ }
+
+ };
+
+ void listen(int port) {
+ //testTheDb();
+ MessageServer::Options options;
+ options.port = port;
+ options.ipList = cmdLine.bind_ip;
+
+ MessageServer * server = createServer( options , new MyMessageHandler() );
+ server->setAsTimeTracker();
+
+ startReplication();
+ if ( !noHttpInterface )
+ boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */));
+
+#if(TESTEXHAUST)
+ boost::thread thr(testExhaust);
+#endif
+ server->run();
+ }
+
+
+ bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ) {
+ static DBDirectClient db;
+
+ if ( h->version == 4 && h->versionMinor == 4 ) {
+ assert( PDFILE_VERSION == 4 );
+ assert( PDFILE_VERSION_MINOR == 5 );
+
+ list<string> colls = db.getCollectionNames( dbName );
+ for ( list<string>::iterator i=colls.begin(); i!=colls.end(); i++) {
+ string c = *i;
+ log() << "\t upgrading collection:" << c << endl;
+ BSONObj out;
+ bool ok = db.runCommand( dbName , BSON( "reIndex" << c.substr( dbName.size() + 1 ) ) , out );
+ if ( ! ok ) {
+ errmsg = "reindex failed";
+ log() << "\t\t reindex failed: " << out << endl;
+ return false;
+ }
+ }
+
+ h->versionMinor = 5;
+ return true;
+ }
+
+ // do this in the general case
+ return repairDatabase( dbName.c_str(), errmsg );
+ }
+
+    // runs at startup.
+ static void repairDatabasesAndCheckVersion() {
+ // LastError * le = lastError.get( true );
+ Client::GodScope gs;
+ log(1) << "enter repairDatabases (to check pdfile version #)" << endl;
+
+ //assert(checkNsFilesOnLoad);
+ checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here.
+
+ dblock lk;
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ string dbName = *i;
+ log(1) << "\t" << dbName << endl;
+ Client::Context ctx( dbName );
+ MongoDataFile *p = cc().database()->getFile( 0 );
+ DataFileHeader *h = p->getHeader();
+ if ( !h->isCurrentVersion() || forceRepair ) {
+
+ if( h->version <= 0 ) {
+ uasserted(14026,
+ str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version
+ << " info: " << h->versionMinor << ' ' << h->fileLength);
+ }
+
+ log() << "****" << endl;
+ log() << "****" << endl;
+ log() << "need to upgrade database " << dbName << " with pdfile version " << h->version << "." << h->versionMinor << ", "
+ << "new version: " << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR << endl;
+ if ( shouldRepairDatabases ) {
+ // QUESTION: Repair even if file format is higher version than code?
+ log() << "\t starting upgrade" << endl;
+ string errmsg;
+ assert( doDBUpgrade( dbName , errmsg , h ) );
+ }
+ else {
+ log() << "\t Not upgrading, exiting" << endl;
+ log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
+ log() << "****" << endl;
+ dbexit( EXIT_NEED_UPGRADE );
+ shouldRepairDatabases = 1;
+ return;
+ }
+ }
+ else {
+ Database::closeDatabase( dbName.c_str(), dbpath );
+ }
+ }
+
+ log(1) << "done repairDatabases" << endl;
+
+ if ( shouldRepairDatabases ) {
+ log() << "finished checking dbs" << endl;
+ cc().shutdown();
+ dbexit( EXIT_CLEAN );
+ }
+
+ checkNsFilesOnLoad = true;
+ }
+
+ void clearTmpFiles() {
+ boost::filesystem::path path( dbpath );
+ for ( boost::filesystem::directory_iterator i( path );
+ i != boost::filesystem::directory_iterator(); ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if ( boost::filesystem::is_directory( *i ) &&
+ fileName.length() && fileName[ 0 ] == '$' )
+ boost::filesystem::remove_all( *i );
+ }
+ }
+
+ void checkIfReplMissingFromCommandLine() {
+ if( !cmdLine.usingReplSets() ) {
+ Client::GodScope gs;
+ DBDirectClient c;
+ unsigned long long x =
+ c.count("local.system.replset");
+ if( x ) {
+ log() << endl;
+ log() << "** warning: mongod started without --replSet yet " << x << " documents are present in local.system.replset" << endl;
+ log() << "** restart with --replSet unless you are doing maintenance and no other clients are connected" << endl;
+ log() << endl;
+ }
+ }
+ }
+
+ void clearTmpCollections() {
+        writelock lk; // _openAllFiles is false at this point, so this helps the query below work; you can't open files when read locked
+ Client::GodScope gs;
+ vector< string > toDelete;
+ DBDirectClient cli;
+ auto_ptr< DBClientCursor > c = cli.query( "local.system.namespaces", Query( fromjson( "{name:/^local.temp./}" ) ) );
+ while( c->more() ) {
+ BSONObj o = c->next();
+ toDelete.push_back( o.getStringField( "name" ) );
+ }
+ for( vector< string >::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) {
+ log() << "Dropping old temporary collection: " << *i << endl;
+ cli.dropCollection( *i );
+ }
+ }
+
+ /**
+ * does background async flushes of mmapped files
+ */
+ class DataFileSync : public BackgroundJob {
+ public:
+ string name() const { return "DataFileSync"; }
+ void run() {
+ if( cmdLine.syncdelay == 0 )
+ log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
+ else if( cmdLine.syncdelay == 1 )
+ log() << "--syncdelay 1" << endl;
+ else if( cmdLine.syncdelay != 60 )
+ log(1) << "--syncdelay " << cmdLine.syncdelay << endl;
+ int time_flushing = 0;
+ while ( ! inShutdown() ) {
+ _diaglog.flush();
+ if ( cmdLine.syncdelay == 0 ) {
+ // in case at some point we add an option to change at runtime
+ sleepsecs(5);
+ continue;
+ }
+
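+                // sleep for whatever is left of the syncdelay interval, crediting
+                // the time the previous flush pass already took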
+ sleepmillis( (long long) std::max(0.0, (cmdLine.syncdelay * 1000) - time_flushing) );
+
+ if ( inShutdown() ) {
+ // occasional issue trying to flush during shutdown when sleep interrupted
+ break;
+ }
+
+ Date_t start = jsTime();
+ int numFiles = MemoryMappedFile::flushAll( true );
+ time_flushing = (int) (jsTime() - start);
+
+ globalFlushCounters.flushed(time_flushing);
+
+ if( logLevel >= 1 || time_flushing >= 10000 ) {
+ log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
+ }
+ }
+ }
+
+ } dataFileSync;
+
+ const char * jsInterruptCallback() {
+ // should be safe to interrupt in js code, even if we have a write lock
+ return killCurrentOp.checkForInterruptNoAssert();
+ }
+
+ unsigned jsGetInterruptSpecCallback() {
+ return cc().curop()->opNum();
+ }
+
+ void _initAndListen(int listenPort ) {
+
+ Client::initThread("initandlisten");
+
+ Database::_openAllFiles = false;
+
+ Logstream::get().addGlobalTee( new RamLog("global") );
+
+ bool is32bit = sizeof(int*) == 4;
+
+ {
+#if !defined(_WIN32)
+ pid_t pid = getpid();
+#else
+ DWORD pid=GetCurrentProcessId();
+#endif
+ Nullstream& l = log();
+ l << "MongoDB starting : pid=" << pid << " port=" << cmdLine.port << " dbpath=" << dbpath;
+ if( replSettings.master ) l << " master=" << replSettings.master;
+ if( replSettings.slave ) l << " slave=" << (int) replSettings.slave;
+ l << ( is32bit ? " 32" : " 64" ) << "-bit host=" << getHostNameCached() << endl;
+ }
+ DEV log() << "_DEBUG build (which is slower)" << endl;
+ show_warnings();
+ log() << mongodVersion() << endl;
+ printGitVersion();
+ printSysInfo();
+ printCommandLineOpts();
+
+ {
+ stringstream ss;
+ ss << endl;
+ ss << "*********************************************************************" << endl;
+ ss << " ERROR: dbpath (" << dbpath << ") does not exist." << endl;
+ ss << " Create this directory or give existing directory in --dbpath." << endl;
+ ss << " See http://www.mongodb.org/display/DOCS/Starting+and+Stopping+Mongo" << endl;
+ ss << "*********************************************************************" << endl;
+ uassert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) );
+ }
+ {
+ stringstream ss;
+ ss << "repairpath (" << repairpath << ") does not exist";
+ uassert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) );
+ }
+
+ acquirePathLock(forceRepair);
+ remove_all( dbpath + "/_tmp/" );
+
+ FileAllocator::get()->start();
+
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" );
+
+ dur::startup();
+
+ if( cmdLine.durOptions & CmdLine::DurRecoverOnly )
+ return;
+
+ // comes after getDur().startup() because this reads from the database
+ clearTmpCollections();
+
+ checkIfReplMissingFromCommandLine();
+
+ Module::initAll();
+
+ if ( scriptingEnabled ) {
+ ScriptEngine::setup();
+ globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback );
+ globalScriptEngine->setGetInterruptSpecCallback( jsGetInterruptSpecCallback );
+ }
+
+ repairDatabasesAndCheckVersion();
+
+ /* we didn't want to pre-open all files for the repair check above. for regular
+ operation we do for read/write lock concurrency reasons.
+ */
+ Database::_openAllFiles = true;
+
+ if ( shouldRepairDatabases )
+ return;
+
+ /* this is for security on certain platforms (nonce generation) */
+ srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros()));
+
+ snapshotThread.go();
+ d.clientCursorMonitor.go();
+ PeriodicTask::theRunner->go();
+
+#ifndef _WIN32
+ CmdLine::launchOk();
+#endif
+ listen(listenPort);
+
+        // listen() will return when the exit code path closes its socket.
+ exitCleanly(EXIT_NET_ERROR);
+ }
+
+ void testPretouch();
+
+ void initAndListen(int listenPort) {
+ try {
+ _initAndListen(listenPort);
+ }
+ catch ( DBException &e ) {
+ log() << "exception in initAndListen: " << e.toString() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( std::exception &e ) {
+ log() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch ( int& n ) {
+ log() << "exception in initAndListen int: " << n << ", terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ catch(...) {
+ log() << "exception in initAndListen, terminating" << endl;
+ dbexit( EXIT_UNCAUGHT );
+ }
+ }
+
+#if defined(_WIN32)
+ bool initService() {
+ ServiceController::reportStatus( SERVICE_RUNNING );
+ initAndListen( cmdLine.port );
+ return true;
+ }
+#endif
+
+} // namespace mongo
+
+using namespace mongo;
+
+#include <boost/program_options.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace po = boost::program_options;
+
+void show_help_text(po::options_description options) {
+ show_warnings();
+ cout << options << endl;
+}
+
+/* Return error string or "" if no errors. */
+string arg_error_check(int argc, char* argv[]) {
+ return "";
+}
+
+int main(int argc, char* argv[]) {
+ static StaticObserver staticObserver;
+ doPreServerStartupInits();
+ getcurns = ourgetns;
+
+ po::options_description general_options("General options");
+#if defined(_WIN32)
+ po::options_description windows_scm_options("Windows Service Control Manager options");
+#endif
+ po::options_description replication_options("Replication options");
+ po::options_description ms_options("Master/slave options");
+ po::options_description rs_options("Replica set options");
+ po::options_description sharding_options("Sharding options");
+ po::options_description visible_options("Allowed options");
+ po::options_description hidden_options("Hidden options");
+
+ po::positional_options_description positional_options;
+
+ CmdLine::addGlobalOptions( general_options , hidden_options );
+
+ general_options.add_options()
+ ("auth", "run with security")
+ ("cpu", "periodically show cpu and iowait utilization")
+ ("dbpath", po::value<string>() , "directory for datafiles")
+ ("diaglog", po::value<int>(), "0=off 1=W 2=R 3=both 7=W+some reads")
+ ("directoryperdb", "each database will be stored in a separate directory")
+ ("journal", "enable journaling")
+ ("journalOptions", po::value<int>(), "journal diagnostic options")
+ ("journalCommitInterval", po::value<unsigned>(), "how often to group/batch commit (ms)")
+ ("ipv6", "enable IPv6 support (disabled by default)")
+ ("jsonp","allow JSONP access via http (has security implications)")
+ ("noauth", "run without security")
+ ("nohttpinterface", "disable http interface")
+ ("nojournal", "disable journaling (journaling is on by default for 64 bit)")
+ ("noprealloc", "disable data file preallocation - will often hurt performance")
+ ("noscripting", "disable scripting engine")
+ ("notablescan", "do not allow table scans")
+ ("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases")
+ ("profile",po::value<int>(), "0=off 1=slow, 2=all")
+ ("quota", "limits each database to a certain number of files (8 default)")
+ ("quotaFiles", po::value<int>(), "number of files allower per db, requires --quota")
+ ("rest","turn on simple rest api")
+ ("repair", "run repair on all dbs")
+ ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" )
+ ("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" )
+ ("smallfiles", "use a smaller default file size")
+#if defined(__linux__)
+ ("shutdown", "kill a running server (for init scripts)")
+#endif
+ ("syncdelay",po::value<double>(&cmdLine.syncdelay)->default_value(60), "seconds between disk syncs (0=never, but not recommended)")
+ ("sysinfo", "print some diagnostic system information")
+ ("upgrade", "upgrade db if needed")
+ ;
+
+#if defined(_WIN32)
+ CmdLine::addWindowsOptions( windows_scm_options, hidden_options );
+#endif
+
+ replication_options.add_options()
+ ("oplogSize", po::value<int>(), "size limit (in MB) for op log")
+ ;
+
+ ms_options.add_options()
+ ("master", "master mode")
+ ("slave", "slave mode")
+ ("source", po::value<string>(), "when slave: specify master as <server:port>")
+ ("only", po::value<string>(), "when slave: specify a single database to replicate")
+ ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave")
+ ("autoresync", "automatically resync if slave data is stale")
+ ;
+
+ rs_options.add_options()
+ ("replSet", po::value<string>(), "arg is <setname>[/<optionalseedhostlist>]")
+ ;
+
+ sharding_options.add_options()
+ ("configsvr", "declare this is a config db of a cluster; default port 27019; default dir /data/configdb")
+ ("shardsvr", "declare this is a shard db of a cluster; default port 27018")
+ ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. this is on by default for now, but default will switch" )
+ ;
+
+ hidden_options.add_options()
+ ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer")
+ ("pretouch", po::value<int>(), "n pretouch threads for applying replicationed operations") // experimental
+ ("command", po::value< vector<string> >(), "command")
+ ("cacheSize", po::value<long>(), "cache size (in MB) for rec store")
+ ("nodur", "disable journaling")
+ // things we don't want people to use
+ ("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION")
+ ("nohints", "ignore query hints")
+ ("nopreallocj", "don't preallocate journal files")
+ ("dur", "enable journaling") // old name for --journal
+ ("durOptions", po::value<int>(), "durability diagnostic options") // deprecated name
+ // deprecated pairing command line options
+ ("pairwith", "DEPRECATED")
+ ("arbiter", "DEPRECATED")
+ ("opIdMem", "DEPRECATED")
+ ;
+
+
+ positional_options.add("command", 3);
+ visible_options.add(general_options);
+#if defined(_WIN32)
+ visible_options.add(windows_scm_options);
+#endif
+ visible_options.add(replication_options);
+ visible_options.add(ms_options);
+ visible_options.add(rs_options);
+ visible_options.add(sharding_options);
+ Module::addOptions( visible_options );
+
+ setupCoreSignals();
+ setupSignals( false );
+
+ dbExecCommand = argv[0];
+
+ srand(curTimeMicros());
+#if( BOOST_VERSION >= 104500 )
+ boost::filesystem::path::default_name_check( boost::filesystem2::no_check );
+#else
+ boost::filesystem::path::default_name_check( boost::filesystem::no_check );
+#endif
+
+ {
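+        // endianness probe: on a little-endian cpu the lowest-addressed byte of
+        // 0x12345678 is 0x78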
+ unsigned x = 0x12345678;
+ unsigned char& b = (unsigned char&) x;
+ if ( b != 0x78 ) {
+ out() << "big endian cpus not yet supported" << endl;
+ return 33;
+ }
+ }
+
+ if( argc == 1 )
+ cout << dbExecCommand << " --help for help and startup options" << endl;
+
+ {
+ po::variables_map params;
+
+ string error_message = arg_error_check(argc, argv);
+ if (error_message != "") {
+ cout << error_message << endl << endl;
+ show_help_text(visible_options);
+ return 0;
+ }
+
+ if ( ! CmdLine::store( argc , argv , visible_options , hidden_options , positional_options , params ) )
+ return 0;
+
+ if (params.count("help")) {
+ show_help_text(visible_options);
+ return 0;
+ }
+ if (params.count("version")) {
+ cout << mongodVersion() << endl;
+ printGitVersion();
+ return 0;
+ }
+ if ( params.count( "dbpath" ) ) {
+ dbpath = params["dbpath"].as<string>();
+ if ( params.count( "fork" ) && dbpath[0] != '/' ) {
+                // we need to change dbpath if we fork, since we change
+                // cwd to "/"; fork only exists on *nix, so '/' is safe
+ dbpath = cmdLine.cwd + "/" + dbpath;
+ }
+ }
+ else {
+ dbpath = "/data/db/";
+ }
+#ifdef _WIN32
+ if (dbpath.size() > 1 && dbpath[dbpath.size()-1] == '/') {
+ // size() check is for the unlikely possibility of --dbpath "/"
+ dbpath = dbpath.erase(dbpath.size()-1);
+ }
+#endif
+
+ if ( params.count("directoryperdb")) {
+ directoryperdb = true;
+ }
+ if (params.count("cpu")) {
+ cmdLine.cpu = true;
+ }
+ if (params.count("noauth")) {
+ noauth = true;
+ }
+ if (params.count("auth")) {
+ noauth = false;
+ }
+ if (params.count("quota")) {
+ cmdLine.quota = true;
+ }
+ if (params.count("quotaFiles")) {
+ cmdLine.quota = true;
+ cmdLine.quotaFiles = params["quotaFiles"].as<int>() - 1;
+ }
+ bool journalExplicit = false;
+ if( params.count("nodur") || params.count( "nojournal" ) ) {
+ journalExplicit = true;
+ cmdLine.dur = false;
+ }
+ if( params.count("dur") || params.count( "journal" ) ) {
+ if (journalExplicit) {
+ log() << "Can't specify both --journal and --nojournal options." << endl;
+ return EXIT_BADOPTIONS;
+ }
+ journalExplicit = true;
+ cmdLine.dur = true;
+ }
+ if (params.count("durOptions")) {
+ cmdLine.durOptions = params["durOptions"].as<int>();
+ }
+ if( params.count("journalCommitInterval") ) {
+                // don't check if dur is false here as many will just use the default, and it will default to off on win32;
+                // i.e. no point making life a little more complex by giving an error in a dev environment.
+ cmdLine.journalCommitInterval = params["journalCommitInterval"].as<unsigned>();
+ if( cmdLine.journalCommitInterval <= 1 || cmdLine.journalCommitInterval > 300 ) {
+ out() << "--journalCommitInterval out of allowed range (0-300ms)" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("journalOptions")) {
+ cmdLine.durOptions = params["journalOptions"].as<int>();
+ }
+ if (params.count("repairpath")) {
+ repairpath = params["repairpath"].as<string>();
+ if (!repairpath.size()) {
+ out() << "repairpath is empty" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("nocursors")) {
+ useCursors = false;
+ }
+ if (params.count("nohints")) {
+ useHints = false;
+ }
+ if (params.count("nopreallocj")) {
+ cmdLine.preallocj = false;
+ }
+ if (params.count("nohttpinterface")) {
+ noHttpInterface = true;
+ }
+ if (params.count("rest")) {
+ cmdLine.rest = true;
+ }
+ if (params.count("jsonp")) {
+ cmdLine.jsonp = true;
+ }
+ if (params.count("noscripting")) {
+ scriptingEnabled = false;
+ }
+ if (params.count("noprealloc")) {
+ cmdLine.prealloc = false;
+ cout << "note: noprealloc may hurt performance in many applications" << endl;
+ }
+ if (params.count("smallfiles")) {
+ cmdLine.smallfiles = true;
+ assert( dur::DataLimitPerJournalFile >= 128 * 1024 * 1024 );
+ dur::DataLimitPerJournalFile = 128 * 1024 * 1024;
+ }
+ if (params.count("diaglog")) {
+ int x = params["diaglog"].as<int>();
+ if ( x < 0 || x > 7 ) {
+ out() << "can't interpret --diaglog setting" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ _diaglog.setLevel(x);
+ }
+ if (params.count("sysinfo")) {
+ sysRuntimeInfo();
+ return 0;
+ }
+ if (params.count("repair")) {
+ Record::MemoryTrackingEnabled = false;
+ shouldRepairDatabases = 1;
+ forceRepair = 1;
+ }
+ if (params.count("upgrade")) {
+ Record::MemoryTrackingEnabled = false;
+ shouldRepairDatabases = 1;
+ }
+ if (params.count("notablescan")) {
+ cmdLine.noTableScan = true;
+ }
+ if (params.count("master")) {
+ replSettings.master = true;
+ }
+ if (params.count("slave")) {
+ replSettings.slave = SimpleSlave;
+ }
+ if (params.count("slavedelay")) {
+ replSettings.slavedelay = params["slavedelay"].as<int>();
+ }
+ if (params.count("fastsync")) {
+ replSettings.fastsync = true;
+ }
+ if (params.count("autoresync")) {
+ replSettings.autoresync = true;
+ if( params.count("replSet") ) {
+ out() << "--autoresync is not used with --replSet" << endl;
+ out() << "see http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if (params.count("source")) {
+ /* specifies what the source in local.sources should be */
+ cmdLine.source = params["source"].as<string>().c_str();
+ }
+ if( params.count("pretouch") ) {
+ cmdLine.pretouch = params["pretouch"].as<int>();
+ }
+ if (params.count("replSet")) {
+ if (params.count("slavedelay")) {
+ out() << "--slavedelay cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ else if (params.count("only")) {
+ out() << "--only cannot be used with --replSet" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ /* seed list of hosts for the repl set */
+ cmdLine._replSet = params["replSet"].as<string>().c_str();
+ }
+ if (params.count("only")) {
+ cmdLine.only = params["only"].as<string>().c_str();
+ }
+ if( params.count("nssize") ) {
+ int x = params["nssize"].as<int>();
+ if (x <= 0 || x > (0x7fffffff/1024/1024)) {
+ out() << "bad --nssize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ lenForNewNsFiles = x * 1024 * 1024;
+ assert(lenForNewNsFiles > 0);
+ }
+ if (params.count("oplogSize")) {
+ long long x = params["oplogSize"].as<int>();
+ if (x <= 0) {
+ out() << "bad --oplogSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ // note a small size such as x==1 is ok for an arbiter.
+ if( x > 1000 && sizeof(void*) == 4 ) {
+ out() << "--oplogSize of " << x << "MB is too big for 32 bit version. Use 64 bit build instead." << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ cmdLine.oplogSize = x * 1024 * 1024;
+ assert(cmdLine.oplogSize > 0);
+ }
+ if (params.count("cacheSize")) {
+ long x = params["cacheSize"].as<long>();
+ if (x <= 0) {
+ out() << "bad --cacheSize arg" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ log() << "--cacheSize option not currently supported" << endl;
+ }
+ if (params.count("port") == 0 ) {
+ if( params.count("configsvr") ) {
+ cmdLine.port = CmdLine::ConfigServerPort;
+ }
+ if( params.count("shardsvr") ) {
+ if( params.count("configsvr") ) {
+ log() << "can't do --shardsvr and --configsvr at the same time" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ cmdLine.port = CmdLine::ShardServerPort;
+ }
+ }
+ else {
+ if ( cmdLine.port <= 0 || cmdLine.port > 65535 ) {
+ out() << "bad --port number" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+ if ( params.count("configsvr" ) ) {
+ cmdLine.configsvr = true;
+ if (cmdLine.usingReplSets() || replSettings.master || replSettings.slave) {
+ log() << "replication should not be enabled on a config server" << endl;
+ ::exit(-1);
+ }
+ if ( params.count( "nodur" ) == 0 && params.count( "nojournal" ) == 0 )
+ cmdLine.dur = true;
+ if ( params.count( "dbpath" ) == 0 )
+ dbpath = "/data/configdb";
+ }
+ if ( params.count( "profile" ) ) {
+ cmdLine.defaultProfile = params["profile"].as<int>();
+ }
+ if (params.count("ipv6")) {
+ enableIPv6();
+ }
+ if (params.count("noMoveParanoia")) {
+ cmdLine.moveParanoia = false;
+ }
+ if (params.count("pairwith") || params.count("arbiter") || params.count("opIdMem")) {
+ out() << "****" << endl;
+ out() << "Replica Pairs have been deprecated. Invalid options: --pairwith, --arbiter, and/or --opIdMem" << endl;
+ out() << "<http://www.mongodb.org/display/DOCS/Replica+Pairs>" << endl;
+ out() << "****" << endl;
+ dbexit( EXIT_BADOPTIONS );
+ }
+
+ // needs to be after things like --configsvr parsing, thus here.
+ if( repairpath.empty() )
+ repairpath = dbpath;
+
+ Module::configAll( params );
+ dataFileSync.go();
+
+ if (params.count("command")) {
+ vector<string> command = params["command"].as< vector<string> >();
+
+ if (command[0].compare("run") == 0) {
+ if (command.size() > 1) {
+ cout << "Too many parameters to 'run' command" << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ initAndListen(cmdLine.port);
+ return 0;
+ }
+
+ if (command[0].compare("dbpath") == 0) {
+ cout << dbpath << endl;
+ return 0;
+ }
+
+ cout << "Invalid command: " << command[0] << endl;
+ cout << visible_options << endl;
+ return 0;
+ }
+
+ if( cmdLine.pretouch )
+ log() << "--pretouch " << cmdLine.pretouch << endl;
+
+#ifdef __linux__
+ if (params.count("shutdown")){
+ bool failed = false;
+
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+ if ( !boost::filesystem::exists( name ) || boost::filesystem::file_size( name ) == 0 )
+ failed = true;
+
+ pid_t pid;
+ string procPath;
+ if (!failed){
+ try {
+ ifstream f (name.c_str());
+ f >> pid;
+ procPath = (str::stream() << "/proc/" << pid);
+ if (!boost::filesystem::exists(procPath))
+ failed = true;
+
+ string exePath = procPath + "/exe";
+ if (boost::filesystem::exists(exePath)){
+ char buf[256];
+                    int ret = readlink(exePath.c_str(), buf, sizeof(buf)-1);
+                    if (ret == -1) {
+                        int e = errno;
+                        cerr << "Error resolving " << exePath << ": " << errnoWithDescription(e);
+                        failed = true;
+                    }
+                    else {
+                        buf[ret] = '\0'; // readlink doesn't terminate the string
+                        if (!endsWith(buf, "mongod")){
+                            cerr << "Process " << pid << " is running " << buf << " not mongod" << endl;
+                            ::exit(-1);
+                        }
+                    }
+ }
+ }
+ catch (const std::exception& e){
+ cerr << "Error reading pid from lock file [" << name << "]: " << e.what() << endl;
+ failed = true;
+ }
+ }
+
+ if (failed) {
+ cerr << "There doesn't seem to be a server running with dbpath: " << dbpath << endl;
+ ::exit(-1);
+ }
+
+ cout << "killing process with pid: " << pid << endl;
+ int ret = kill(pid, SIGTERM);
+ if (ret) {
+ int e = errno;
+ cerr << "failed to kill process: " << errnoWithDescription(e) << endl;
+ ::exit(-1);
+ }
+
+ while (boost::filesystem::exists(procPath)) {
+ sleepsecs(1);
+ }
+
+ ::exit(0);
+ }
+#endif
+
+#if defined(_WIN32)
+ if (serviceParamsCheck( params, dbpath, argc, argv )) {
+ return 0;
+ }
+#endif
+
+
+ if (sizeof(void*) == 4 && !journalExplicit){
+ // trying to make this stand out more like startup warnings
+ log() << endl;
+ warning() << "32-bit servers don't have journaling enabled by default. Please use --journal if you want durability." << endl;
+ log() << endl;
+ }
+
+ }
+
+ UnitTest::runTests();
+ initAndListen(cmdLine.port);
+ dbexit(EXIT_CLEAN);
+ return 0;
+}
+
+namespace mongo {
+
+ string getDbContext();
+
+#undef out
+
+
+#if !defined(_WIN32)
+
+} // namespace mongo
+
+#include <signal.h>
+#include <string.h>
+
+namespace mongo {
+
+ void pipeSigHandler( int signal ) {
+#ifdef psignal
+ psignal( signal, "Signal Received : ");
+#else
+ cout << "got pipe signal:" << signal << endl;
+#endif
+ }
+
+ void abruptQuit(int x) {
+ ostringstream ossSig;
+ ossSig << "Got signal: " << x << " (" << strsignal( x ) << ")." << endl;
+ rawOut( ossSig.str() );
+
+ /*
+ ostringstream ossOp;
+ ossOp << "Last op: " << currentOp.infoNoauth() << endl;
+ rawOut( ossOp.str() );
+ */
+
+ ostringstream oss;
+ oss << "Backtrace:" << endl;
+ printStackTrace( oss );
+ rawOut( oss.str() );
+
+ // Don't go through normal shutdown procedure. It may make things worse.
+ ::exit(EXIT_ABRUPT);
+
+ }
+
+ void abruptQuitWithAddrSignal( int signal, siginfo_t *siginfo, void * ) {
+ ostringstream oss;
+ oss << "Invalid";
+ if ( signal == SIGSEGV || signal == SIGBUS ) {
+ oss << " access";
+ } else {
+ oss << " operation";
+ }
+ oss << " at address: " << siginfo->si_addr << endl;
+ rawOut( oss.str() );
+ abruptQuit( signal );
+ }
+
+ sigset_t asyncSignals;
+    // the signals in asyncSignals will be processed by this thread only, in order to
+    // ensure the db and log mutexes aren't held.
+ void interruptThread() {
+ int x;
+ sigwait( &asyncSignals, &x );
+ log() << "got kill or ctrl c or hup signal " << x << " (" << strsignal( x ) << "), will terminate after current cmd ends" << endl;
+ Client::initThread( "interruptThread" );
+ exitCleanly( EXIT_KILL );
+ }
+
+ // this will be called in certain c++ error cases, for example if there are two active
+ // exceptions
+ void myterminate() {
+ rawOut( "terminate() called, printing stack:" );
+ printStackTrace();
+ ::abort();
+ }
+
+ // this gets called when new fails to allocate memory
+ void my_new_handler() {
+ rawOut( "out of memory, printing stack and exiting:" );
+ printStackTrace();
+ ::exit(EXIT_ABRUPT);
+ }
+
+ void setupSignals_ignoreHelper( int signal ) {}
+
+ void setupSignals( bool inFork ) {
+ struct sigaction addrSignals;
+ memset( &addrSignals, 0, sizeof( struct sigaction ) );
+ addrSignals.sa_sigaction = abruptQuitWithAddrSignal;
+ sigemptyset( &addrSignals.sa_mask );
+ addrSignals.sa_flags = SA_SIGINFO;
+
+ assert( sigaction(SIGSEGV, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGBUS, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGILL, &addrSignals, 0) == 0 );
+ assert( sigaction(SIGFPE, &addrSignals, 0) == 0 );
+
+ assert( signal(SIGABRT, abruptQuit) != SIG_ERR );
+ assert( signal(SIGQUIT, abruptQuit) != SIG_ERR );
+ assert( signal(SIGPIPE, pipeSigHandler) != SIG_ERR );
+
+ setupSIGTRAPforGDB();
+
+ sigemptyset( &asyncSignals );
+
+ if ( inFork )
+ assert( signal( SIGHUP , setupSignals_ignoreHelper ) != SIG_ERR );
+ else
+ sigaddset( &asyncSignals, SIGHUP );
+
+ sigaddset( &asyncSignals, SIGINT );
+ sigaddset( &asyncSignals, SIGTERM );
+ assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 );
+ boost::thread it( interruptThread );
+
+ set_terminate( myterminate );
+ set_new_handler( my_new_handler );
+ }
+
+#else
+ void consoleTerminate( const char* controlCodeName ) {
+ Client::initThread( "consoleTerminate" );
+ log() << "got " << controlCodeName << ", will terminate after current cmd ends" << endl;
+ exitCleanly( EXIT_KILL );
+ }
+
+ BOOL CtrlHandler( DWORD fdwCtrlType ) {
+
+ switch( fdwCtrlType ) {
+
+ case CTRL_C_EVENT:
+ rawOut( "Ctrl-C signal" );
+ consoleTerminate( "CTRL_C_EVENT" );
+ return TRUE ;
+
+ case CTRL_CLOSE_EVENT:
+ rawOut( "CTRL_CLOSE_EVENT signal" );
+ consoleTerminate( "CTRL_CLOSE_EVENT" );
+ return TRUE ;
+
+ case CTRL_BREAK_EVENT:
+ rawOut( "CTRL_BREAK_EVENT signal" );
+ consoleTerminate( "CTRL_BREAK_EVENT" );
+ return TRUE;
+
+ case CTRL_LOGOFF_EVENT:
+ rawOut( "CTRL_LOGOFF_EVENT signal" );
+ consoleTerminate( "CTRL_LOGOFF_EVENT" );
+ return TRUE;
+
+ case CTRL_SHUTDOWN_EVENT:
+ rawOut( "CTRL_SHUTDOWN_EVENT signal" );
+ consoleTerminate( "CTRL_SHUTDOWN_EVENT" );
+ return TRUE;
+
+ default:
+ return FALSE;
+ }
+ }
+
+ LPTOP_LEVEL_EXCEPTION_FILTER filtLast = 0;
+ ::HANDLE standardOut = GetStdHandle(STD_OUTPUT_HANDLE);
+ LONG WINAPI exceptionFilter(struct _EXCEPTION_POINTERS *ExceptionInfo) {
+ {
+            // given the severity of the event, we write to the console in addition to the --logFile
+            // (rawOut writes to the logfile, if one was specified)
+ DWORD written;
+ WriteFile(standardOut, "unhandled windows exception\n", 20, &written, 0);
+ FlushFileBuffers(standardOut);
+ }
+
+ DWORD ec = ExceptionInfo->ExceptionRecord->ExceptionCode;
+ if( ec == EXCEPTION_ACCESS_VIOLATION ) {
+ rawOut("access violation");
+ }
+ else {
+ rawOut("unhandled windows exception");
+ char buf[64];
+ strcpy(buf, "ec=0x");
+ _ui64toa(ec, buf+5, 16);
+ rawOut(buf);
+ }
+ if( filtLast )
+ return filtLast(ExceptionInfo);
+ return EXCEPTION_EXECUTE_HANDLER;
+ }
+
+ // called by mongoAbort()
+ extern void (*reportEventToSystem)(const char *msg);
+ void reportEventToSystemImpl(const char *msg) {
+ static ::HANDLE hEventLog = RegisterEventSource( NULL, TEXT("mongod") );
+ if( hEventLog ) {
+ std::wstring s = toNativeString(msg);
+ LPCTSTR txt = s.c_str();
+ BOOL ok = ReportEvent(
+ hEventLog, EVENTLOG_ERROR_TYPE,
+ 0, 0, NULL,
+ 1,
+ 0,
+ &txt,
+ 0);
+ wassert(ok);
+ }
+ }
+
+ void myPurecallHandler() {
+ printStackTrace();
+ mongoAbort("pure virtual");
+ }
+
+ void setupSignals( bool inFork ) {
+ reportEventToSystem = reportEventToSystemImpl;
+ filtLast = SetUnhandledExceptionFilter(exceptionFilter);
+ massert(10297 , "Couldn't register Windows Ctrl-C handler", SetConsoleCtrlHandler((PHANDLER_ROUTINE) CtrlHandler, TRUE));
+ _set_purecall_handler( myPurecallHandler );
+ }
+
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/db/db.h b/src/mongo/db/db.h
new file mode 100644
index 00000000000..6a31a06f77c
--- /dev/null
+++ b/src/mongo/db/db.h
@@ -0,0 +1,120 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/net/message.h"
+#include "concurrency.h"
+#include "pdfile.h"
+#include "curop.h"
+#include "client.h"
+#include "databaseholder.h"
+
+namespace mongo {
+
+ struct dbtemprelease {
+ Client::Context * _context;
+ int _locktype;
+
+ dbtemprelease() {
+ const Client& c = cc();
+ _context = c.getContext();
+ _locktype = d.dbMutex.getState();
+ assert( _locktype );
+
+ if ( _locktype > 0 ) {
+ massert( 10298 , "can't temprelease nested write lock", _locktype == 1);
+ if ( _context ) _context->unlocked();
+ d.dbMutex.unlock();
+ }
+ else {
+ massert( 10299 , "can't temprelease nested read lock", _locktype == -1);
+ if ( _context ) _context->unlocked();
+ d.dbMutex.unlock_shared();
+ }
+
+ verify( 14814 , c.curop() );
+ c.curop()->yielded();
+
+ }
+ ~dbtemprelease() {
+ if ( _locktype > 0 )
+ d.dbMutex.lock();
+ else
+ d.dbMutex.lock_shared();
+
+ if ( _context ) _context->relocked();
+ }
+ };
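+
+    /* typical shape of a caller (illustrative sketch only):
+
+           {
+               dbtemprelease tr;  // yields the db lock (read or write) here
+               // ... do something slow that must not hold the lock ...
+           }                      // destructor reacquires the lock in the same mode
+    */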
+
+ /** must be write locked
+ no assert (and no release) if nested write lock
+ a lot like dbtempreleasecond but no malloc so should be a tiny bit faster
+ */
+ struct dbtempreleasewritelock {
+ Client::Context * _context;
+ int _locktype;
+ dbtempreleasewritelock() {
+ const Client& c = cc();
+ _context = c.getContext();
+ _locktype = d.dbMutex.getState();
+ assert( _locktype >= 1 );
+ if( _locktype > 1 )
+ return; // nested
+ if ( _context )
+ _context->unlocked();
+ d.dbMutex.unlock();
+ verify( 14845 , c.curop() );
+ c.curop()->yielded();
+ }
+ ~dbtempreleasewritelock() {
+ if ( _locktype == 1 )
+ d.dbMutex.lock();
+ if ( _context )
+ _context->relocked();
+ }
+ };
+
+ /**
+ only does a temp release if we're not nested and have a lock
+ */
+ struct dbtempreleasecond {
+ dbtemprelease * real;
+ int locktype;
+
+ dbtempreleasecond() {
+ real = 0;
+ locktype = d.dbMutex.getState();
+ if ( locktype == 1 || locktype == -1 )
+ real = new dbtemprelease();
+ }
+
+ ~dbtempreleasecond() {
+ if ( real ) {
+ delete real;
+ real = 0;
+ }
+ }
+
+ bool unlocked() {
+ return real != 0;
+ }
+ };
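+
+    /* illustrative sketch: dbtempreleasecond is safe whether or not the caller
+       holds a simple (non-nested) lock:
+
+           dbtempreleasecond drc;
+           if ( drc.unlocked() ) {
+               // the lock was actually yielded here
+           }
+    */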
+
+} // namespace mongo
+
+#include "concurrency.h"
diff --git a/src/mongo/db/db.rc b/src/mongo/db/db.rc
new file mode 100755
index 00000000000..b589458cf73
--- /dev/null
+++ b/src/mongo/db/db.rc
@@ -0,0 +1,12 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Icon
+//
+// Icon with lowest ID value placed first to ensure application icon
+// remains consistent on all systems.
+IDI_ICON2 ICON "mongo.ico"
+///////////////////////////////////////////////////////////////////////////// \ No newline at end of file
diff --git a/src/mongo/db/db.vcxproj b/src/mongo/db/db.vcxproj
new file mode 100755
index 00000000000..8963f0af580
--- /dev/null
+++ b/src/mongo/db/db.vcxproj
@@ -0,0 +1,934 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectName>mongod</ProjectName>
+ <ProjectGuid>{215B2D68-0A70-4D10-8E75-B31010C62A91}</ProjectGuid>
+ <RootNamespace>db</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.;..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>;;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\cloud.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\distinct.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="curop.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="ops\count.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="querypattern.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeaderFile>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="d_globals.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
\ No newline at end of file
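
Note on the precompiled-header settings in the project file above: the project-wide default is <PrecompiledHeader>Use</PrecompiledHeader> with pch.h; ..\pch.cpp overrides it to Create, sources such as ..\shell\mongo.cpp and the snappy files override it to NotUsing, and the empty <PrecompiledHeader></PrecompiledHeader> elements on the pcre-7.4 and md5 sources clear the inherited value, which disables the PCH for those files just as NotUsing does. A minimal sketch of that MSBuild pattern, for illustration only (foo.cpp and bar.c are hypothetical file names, not part of this commit):

  <ItemDefinitionGroup>
    <ClCompile>
      <PrecompiledHeader>Use</PrecompiledHeader>      <!-- project default: consume the PCH -->
      <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
    </ClCompile>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="pch.cpp">
      <PrecompiledHeader>Create</PrecompiledHeader>   <!-- builds the .pch once -->
    </ClCompile>
    <ClCompile Include="foo.cpp" />                   <!-- inherits Use from the defaults -->
    <ClCompile Include="bar.c">
      <PrecompiledHeader>NotUsing</PrecompiledHeader> <!-- opt out, e.g. for plain C sources -->
    </ClCompile>
  </ItemGroup>
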
diff --git a/src/mongo/db/db.vcxproj.filters b/src/mongo/db/db.vcxproj.filters
new file mode 100755
index 00000000000..a39df0dc796
--- /dev/null
+++ b/src/mongo/db/db.vcxproj.filters
@@ -0,0 +1,432 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp" />
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\util\alignedbuilder.cpp" />
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\distinct.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp" />
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp" />
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\md5.c" />
+ <ClCompile Include="..\util\md5main.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="querypattern.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c" />
+ <ClCompile Include="commands\cloud.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="ops\count.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="curop.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="d_globals.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib" />
+ <Library Include="..\..\js\js32r.lib" />
+ <Library Include="..\..\js\js64d.lib" />
+ <Library Include="..\..\js\js64r.lib" />
+ </ItemGroup>
+ <ItemGroup>
+ <Filter Include="snappy">
+ <UniqueIdentifier>{bb99c086-7926-4f50-838d-f5f0c18397c0}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+</Project>
\ No newline at end of file
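
The .vcxproj.filters file above affects only Solution Explorer grouping, not the build: each visual folder is declared once with a GUID (as with the "snappy" filter), and individual files are then assigned to it per item. A minimal sketch of that pattern (the folder name, file name, and GUID here are placeholders):

  <ItemGroup>
    <Filter Include="third_party">
      <UniqueIdentifier>{00000000-0000-0000-0000-000000000000}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="..\third_party\example.c">
      <Filter>third_party</Filter>
    </ClCompile>
  </ItemGroup>
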
diff --git a/src/mongo/db/db_10.sln b/src/mongo/db/db_10.sln
new file mode 100755
index 00000000000..c1d83f3901a
--- /dev/null
+++ b/src/mongo/db/db_10.sln
@@ -0,0 +1,168 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{4082881B-EB00-486F-906C-843B8EC06E18}"
+ ProjectSection(SolutionItems) = preProject
+ driverHelpers.cpp = driverHelpers.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
+ ProjectSection(SolutionItems) = preProject
+ ..\shell\msvc\createCPPfromJavaScriptFiles.js = ..\shell\msvc\createCPPfromJavaScriptFiles.js
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}"
+ ProjectSection(SolutionItems) = preProject
+ ..\util\processinfo_darwin.cpp = ..\util\processinfo_darwin.cpp
+ ..\util\processinfo_linux2.cpp = ..\util\processinfo_linux2.cpp
+ ..\util\processinfo_none.cpp = ..\util\processinfo_none.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "other", "other", "{12B11474-2D74-48C3-BB3D-F03249BEA88F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcxproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcxproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcxproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bsondemo", "..\bson\bsondemo\bsondemo.vcxproj", "{C9DB5EB7-81AA-4185-BAA1-DA035654402F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoutils test program", "..\util\mongoutils\mongoutils.vcxproj", "{7B84584E-92BC-4DB9-971B-A1A8F93E5053}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple_client_demo", "..\client\examples\simple_client_demo.vcxproj", "{89C30BC3-2874-4F2C-B4DA-EB04E9782236}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongo", "..\shell\msvc\mongo.vcxproj", "{FE959BD8-8EE2-4555-AE59-9FA14FFD410E}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoperf", "..\client\examples\mongoperf.vcxproj", "{79D4E297-BFB7-4FF2-9B13-08A146582E46}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Debug|Mixed Platforms = Debug|Mixed Platforms
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Any CPU = Release|Any CPU
+ Release|Mixed Platforms = Release|Mixed Platforms
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Any CPU.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.ActiveCfg = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.Build.0 = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Any CPU.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.ActiveCfg = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.Build.0 = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.Build.0 = Release|x64
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|x64.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Any CPU.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|x64.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|x64.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Any CPU.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|x64.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|x64.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Any CPU.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|x64.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|x64.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Any CPU.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|x64.ActiveCfg = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(NestedProjects) = preSolution
+ {2B262D59-9DC7-4BF1-A431-1BD4966899A5} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {2F760952-C71B-4865-998F-AABAE96D1373} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {4082881B-EB00-486F-906C-843B8EC06E18} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ EndGlobalSection
+EndGlobal
diff --git a/src/mongo/db/dbcommands.cpp b/src/mongo/db/dbcommands.cpp
new file mode 100644
index 00000000000..570c897fae4
--- /dev/null
+++ b/src/mongo/db/dbcommands.cpp
@@ -0,0 +1,1955 @@
+// dbcommands.cpp
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* SHARDING:
+ I believe this file is for mongod only.
+ See s/commands_public.cpp for mongos.
+*/
+
+#include "pch.h"
+#include "ops/count.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "../util/ramlog.h"
+#include "json.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "replutil.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "queryoptimizer.h"
+#include "../scripting/engine.h"
+#include "stats/counters.h"
+#include "background.h"
+#include "../util/version.h"
+#include "../s/d_writeback.h"
+#include "dur_stats.h"
+
+namespace mongo {
+
+ namespace dur {
+ void setAgeOutJournalFiles(bool rotate);
+ }
+ /** @return true if fields found */
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ BSONElement e = cmdObj["ageOutJournalFiles"];
+ if( !e.eoo() ) {
+ bool r = e.trueValue();
+ log() << "ageOutJournalFiles " << r << endl;
+ dur::setAgeOutJournalFiles(r);
+ return true;
+ }
+ return false;
+ }
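+
+ /* Illustrative only (a sketch, not part of the command set defined in this
+ file): setParmsMongodSpecific is reached through the setParameter command,
+ so toggling journal file aging from the shell would look roughly like
+ db.adminCommand( { setParameter : 1, ageOutJournalFiles : false } )
+ assuming an authenticated admin connection. */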
+
+ /* reset any errors so that getlasterror comes back clean.
+
+ useful before performing a long series of operations where we want to
+ see if any of the operations triggered an error, but don't want to check
+ after each op as that would be a client/server turnaround.
+ */
+ class CmdResetError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "reset error state (used with getpreverror)";
+ }
+ CmdResetError() : Command("resetError", false, "reseterror") {}
+ bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.get();
+ assert( le );
+ le->reset();
+ return true;
+ }
+ } cmdResetError;
+
+ /* set by replica sets if specified in the configuration.
+ a pointer is used to avoid any possible locking issues with lockless reading (see locktype() below:
+ it is NONE and we would like to keep it that way)
+ (for now, it simply orphans any old copy as config changes should be extremely rare).
+ note: once non-null, never goes to null again.
+ */
+ BSONObj *getLastErrorDefault = 0;
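+
+ /* Sketch of where the default comes from, assuming the replica set config
+ format of this era: the config document may carry
+ settings : { getLastErrorDefaults : { w : 2, wtimeout : 5000 } }
+ which the replica set code parses into the BSONObj pointed to here. */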
+
+ class CmdGetLastError : public Command {
+ public:
+ CmdGetLastError() : Command("getLastError", false, "getlasterror") { }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "return error status of the last operation on this connection\n"
+ << "options:\n"
+ << " { fsync:true } - fsync before returning, or wait for journal commit if running with --journal\n"
+ << " { j:true } - wait for journal commit if running with --journal\n"
+ << " { w:n } - await replication to n servers (including self) before returning\n"
+ << " { wtimeout:m} - timeout for w in m milliseconds";
+ }
+ bool run(const string& dbname, BSONObj& _cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+
+ bool err = false;
+
+ if ( le->nPrev != 1 )
+ err = LastError::noError.appendSelf( result , false );
+ else
+ err = le->appendSelf( result , false );
+
+ Client& c = cc();
+ c.appendLastOp( result );
+
+ result.appendNumber( "connectionId" , c.getConnectionId() ); // for sharding; also useful in general for debugging
+
+ BSONObj cmdObj = _cmdObj;
+ {
+ BSONObj::iterator i(_cmdObj);
+ i.next();
+ if( !i.more() ) {
+ /* empty, use default */
+ BSONObj *def = getLastErrorDefault;
+ if( def )
+ cmdObj = *def;
+ }
+ }
+
+ if ( cmdObj["j"].trueValue() ) {
+ if( !getDur().awaitCommit() ) {
+ // --journal is off
+ result.append("jnote", "journaling not enabled on this server");
+ }
+ if( cmdObj["fsync"].trueValue() ) {
+ errmsg = "fsync and j options are not used together";
+ return false;
+ }
+ }
+ else if ( cmdObj["fsync"].trueValue() ) {
+ Timer t;
+ if( !getDur().awaitCommit() ) {
+ // if we get here, we are not running with --journal
+ log() << "fsync from getlasterror" << endl;
+ result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) );
+ }
+ else {
+ // this is perhaps temporary: report how long we waited for the group commit to occur.
+ result.append( "waited", t.millis() );
+ }
+ }
+
+ if ( err ) {
+ // doesn't make sense to wait for replication
+ // if there was an error
+ return true;
+ }
+
+ BSONElement e = cmdObj["w"];
+ if ( e.ok() ) {
+ int timeout = cmdObj["wtimeout"].numberInt();
+ Timer t;
+
+ long long passes = 0;
+ char buf[32];
+ while ( 1 ) {
+ OpTime op(c.getLastOp());
+
+ if ( op.isNull() ) {
+ if ( anyReplEnabled() ) {
+ result.append( "wnote" , "no write has been done on this connection" );
+ }
+ else if ( e.isNumber() && e.numberInt() <= 1 ) {
+ // don't do anything
+ // w=1 and no repl, so this is fine
+ }
+ else {
+ // w=2 and no repl
+ result.append( "wnote" , "no replication has been enabled, so w=2+ won't work" );
+ result.append( "err", "norepl" );
+ return true;
+ }
+ break;
+ }
+
+ // check this first for w=0 or w=1
+ if ( opReplicatedEnough( op, e ) ) {
+ break;
+ }
+
+ // if replication isn't enabled (e.g., config servers)
+ if ( ! anyReplEnabled() ) {
+ result.append( "err", "norepl" );
+ return true;
+ }
+
+
+ if ( timeout > 0 && t.millis() >= timeout ) {
+ result.append( "wtimeout" , true );
+ errmsg = "timed out waiting for slaves";
+ result.append( "waited" , t.millis() );
+ result.append( "err" , "timeout" );
+ return true;
+ }
+
+ assert( sprintf( buf , "w block pass: %lld" , ++passes ) < 30 );
+ c.curop()->setMessage( buf );
+ sleepmillis(1);
+ killCurrentOp.checkForInterrupt();
+ }
+ result.appendNumber( "wtime" , t.millis() );
+ }
+
+ result.appendNull( "err" );
+ return true;
+ }
+ } cmdGetLastError;
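+
+ /* Example invocation (illustrative; options per the help text above):
+ db.runCommand( { getLastError : 1, w : 2, wtimeout : 5000 } )
+ waits until the last write on this connection has replicated to two members
+ (including self), or until five seconds have elapsed, whichever comes first. */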
+
+ class CmdGetPrevError : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "check for errors since last reseterror commandcal";
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ LastError *le = lastError.disableForCommand();
+ le->appendSelf( result );
+ if ( le->valid )
+ result.append( "nPrev", le->nPrev );
+ else
+ result.append( "nPrev", -1 );
+ return true;
+ }
+ } cmdGetPrevError;
+
+ CmdShutdown cmdShutdown;
+
+ void CmdShutdown::help( stringstream& help ) const {
+ help << "shutdown the database. must be ran against admin db and "
+ << "either (1) ran from localhost or (2) authenticated. If "
+ << "this is a primary in a replica set and there is no member "
+ << "within 10 seconds of its optime, it will not shutdown "
+ << "without force : true. You can also specify timeoutSecs : "
+ << "N to wait N seconds for other members to catch up.";
+ }
+
+ bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ if (!force && theReplSet && theReplSet->isPrimary()) {
+ long long timeout, now, start;
+ timeout = now = start = curTimeMicros64()/1000000;
+ if (cmdObj.hasField("timeoutSecs")) {
+ timeout += cmdObj["timeoutSecs"].numberLong();
+ }
+
+ OpTime lastOp = theReplSet->lastOpTimeWritten;
+ OpTime closest = theReplSet->lastOtherOpTime();
+ long long int diff = lastOp.getSecs() - closest.getSecs();
+ while (now <= timeout && (diff < 0 || diff > 10)) {
+ sleepsecs(1);
+ now++;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ closest = theReplSet->lastOtherOpTime();
+ diff = lastOp.getSecs() - closest.getSecs();
+ }
+
+ if (diff < 0 || diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ result.append("closest", closest.getSecs());
+ result.append("difference", diff);
+ return false;
+ }
+
+ // step down
+ theReplSet->stepDown(120);
+
+ log() << "waiting for secondaries to catch up" << endl;
+
+ lastOp = theReplSet->lastOpTimeWritten;
+ while (lastOp != closest && now - start < 60) {
+ closest = theReplSet->lastOtherOpTime();
+
+ now++;
+ sleepsecs(1);
+ }
+
+ // regardless of whether they caught up, we'll shut down
+ }
+
+ return shutdownHelper();
+ }
+
+ class CmdDropDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "drop (delete) this database";
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdDropDatabase() : Command("dropDatabase") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ log() << "dropDatabase " << dbname << endl;
+ int p = (int) e.number();
+ if ( p != 1 )
+ return false;
+ dropDatabase(dbname);
+ result.append( "dropped" , dbname );
+ return true;
+ }
+ } cmdDropDatabase;
+
+ class CmdRepairDatabase : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool maintenanceMode() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "repair database. also compacts. note: slow.";
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdRepairDatabase() : Command("repairDatabase") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ log() << "repairDatabase " << dbname << endl;
+ int p = (int) e.number();
+ if ( p != 1 ) {
+ errmsg = "bad option";
+ return false;
+ }
+ e = cmdObj.getField( "preserveClonedFilesOnFailure" );
+ bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean();
+ e = cmdObj.getField( "backupOriginalFiles" );
+ bool backupOriginalFiles = e.isBoolean() && e.boolean();
+ return repairDatabase( dbname, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles );
+ }
+ } cmdRepairDatabase;
+
+ /* set db profiling level
+ todo: how do we handle profiling information put in the db with replication?
+ sensibly or not?
+ */
+ class CmdProfile : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << "enable or disable performance profiling\n";
+ help << "{ profile : <n> }\n";
+ help << "0=off 1=log slow ops 2=log all\n";
+ help << "-1 to get current values\n";
+ help << "http://www.mongodb.org/display/DOCS/Database+Profiler";
+ }
+ virtual LockType locktype() const { return WRITE; }
+ CmdProfile() : Command("profile") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONElement e = cmdObj.firstElement();
+ result.append("was", cc().database()->profile);
+ result.append("slowms", cmdLine.slowMS );
+
+ int p = (int) e.number();
+ bool ok = false;
+
+ if ( p == -1 )
+ ok = true;
+ else if ( p >= 0 && p <= 2 ) {
+ ok = cc().database()->setProfilingLevel( p , errmsg );
+ }
+
+ BSONElement slow = cmdObj["slowms"];
+ if ( slow.isNumber() )
+ cmdLine.slowMS = slow.numberInt();
+
+ return ok;
+ }
+ } cmdProfile;
+
+ class CmdServerStatus : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdServerStatus() : Command("serverStatus", true) {}
+
+ virtual LockType locktype() const { return NONE; }
+
+ virtual void help( stringstream& help ) const {
+ help << "returns lots of administrative server statistics";
+ }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ long long start = Listener::getElapsedTimeMillis();
+ BSONObjBuilder timeBuilder(128);
+
+
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+
+ result.append( "host" , prettyHostName() );
+ result.append("version", versionString);
+ result.append("process","mongod");
+ result.append("uptime",(double) (time(0)-cmdLine.started));
+ result.append("uptimeEstimate",(double) (start/1000));
+ result.appendDate( "localTime" , jsTime() );
+
+ {
+ BSONObjBuilder t;
+
+ unsigned long long last, start, timeLocked;
+ d.dbMutex.info().getTimingInfo(start, timeLocked);
+ last = curTimeMicros64();
+ double tt = (double) last-start;
+ double tl = (double) timeLocked;
+ t.append("totalTime", tt);
+ t.append("lockTime", tl);
+ t.append("ratio", (tt ? tl/tt : 0));
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) );
+ int w=0, r=0;
+ Client::recommendedYieldMicros( &w , &r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+ {
+ BSONObjBuilder ttt( t.subobjStart( "activeClients" ) );
+ int w=0, r=0;
+ Client::getActiveClientCount( w , r );
+ ttt.append( "total" , w + r );
+ ttt.append( "readers" , r );
+ ttt.append( "writers" , w );
+ ttt.done();
+ }
+
+
+
+ result.append( "globalLock" , t.obj() );
+ }
+ timeBuilder.appendNumber( "after basic" , Listener::getElapsedTimeMillis() - start );
+
+ {
+
+ BSONObjBuilder t( result.subobjStart( "mem" ) );
+
+ t.append("bits", ( sizeof(int*) == 4 ? 32 : 64 ) );
+
+ ProcessInfo p;
+ int v = 0;
+ if ( p.supported() ) {
+ t.appendNumber( "resident" , p.getResidentSize() );
+ v = p.getVirtualMemorySize();
+ t.appendNumber( "virtual" , v );
+ t.appendBool( "supported" , true );
+ }
+ else {
+ result.append( "note" , "not all mem info support on this platform" );
+ t.appendBool( "supported" , false );
+ }
+
+ timeBuilder.appendNumber( "middle of mem" , Listener::getElapsedTimeMillis() - start );
+
+ int m = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ));
+ t.appendNumber( "mapped" , m );
+
+ if ( cmdLine.dur ) {
+ m *= 2;
+ t.appendNumber( "mappedWithJournal" , m );
+ }
+
+ int overhead = v - m - connTicketHolder.used();
+
+ if( overhead > 4000 ) {
+ t.append("note", "virtual minus mapped is large. could indicate a memory leak");
+ log() << "warning: virtual size (" << v << "MB) - mapped size (" << m << "MB) is large (" << overhead << "MB). could indicate a memory leak" << endl;
+ }
+
+ t.done();
+
+ }
+ timeBuilder.appendNumber( "after mem" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "connections" ) );
+ bb.append( "current" , connTicketHolder.used() );
+ bb.append( "available" , connTicketHolder.available() );
+ bb.done();
+ }
+ timeBuilder.appendNumber( "after connections" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "extra_info" ) );
+ bb.append("note", "fields vary by platform");
+ ProcessInfo p;
+ p.getExtraInfo(bb);
+ bb.done();
+ timeBuilder.appendNumber( "after extra info" , Listener::getElapsedTimeMillis() - start );
+
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "indexCounters" ) );
+ globalIndexCounters.append( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) );
+ globalFlushCounters.append( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "cursors" ) );
+ ClientCursor::appendStats( bb );
+ bb.done();
+ }
+
+ {
+ BSONObjBuilder bb( result.subobjStart( "network" ) );
+ networkCounter.append( bb );
+ bb.done();
+ }
+
+
+ timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start );
+
+ if ( anyReplEnabled() ) {
+ BSONObjBuilder bb( result.subobjStart( "repl" ) );
+ appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() );
+ bb.done();
+
+ if ( ! _isMaster() ) {
+ result.append( "opcountersRepl" , replOpCounters.getObj() );
+ }
+
+ }
+
+ timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start );
+
+ result.append( "opcounters" , globalOpCounters.getObj() );
+
+ {
+ BSONObjBuilder asserts( result.subobjStart( "asserts" ) );
+ asserts.append( "regular" , assertionCount.regular );
+ asserts.append( "warning" , assertionCount.warning );
+ asserts.append( "msg" , assertionCount.msg );
+ asserts.append( "user" , assertionCount.user );
+ asserts.append( "rollovers" , assertionCount.rollovers );
+ asserts.done();
+ }
+
+ timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start );
+
+ result.append( "writeBacksQueued" , ! writeBackManager.queuesEmpty() );
+
+ if( cmdLine.dur ) {
+ result.append("dur", dur::stats.asObj());
+ }
+
+ timeBuilder.appendNumber( "after dur" , Listener::getElapsedTimeMillis() - start );
+
+ {
+ RamLog* rl = RamLog::get( "warnings" );
+ verify(15880, rl);
+
+ if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "warnings" ) );
+ for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ }
+
+ if ( ! authed )
+ result.append( "note" , "run against admin for more info" );
+
+ timeBuilder.appendNumber( "at end" , Listener::getElapsedTimeMillis() - start );
+ if ( Listener::getElapsedTimeMillis() - start > 1000 ) {
+ BSONObj t = timeBuilder.obj();
+ log() << "serverStatus was very slow: " << t << endl;
+ result.append( "timing" , t );
+ }
+
+ return true;
+ }
+ } cmdServerStatus;
+
+ class CmdGetOpTime : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const { help << "internal"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdGetOpTime() : Command("getoptime") { }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ writelock l( "" );
+ result.appendDate("optime", OpTime::now().asDate());
+ return true;
+ }
+ } cmdgetoptime;
+
+ /*
+ class Cmd : public Command {
+ public:
+ Cmd() : Command("") { }
+ bool adminOnly() const { return true; }
+ bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result) {
+ return true;
+ }
+ } cmd;
+ */
+
+ class CmdDiagLogging : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ CmdDiagLogging() : Command("diagLogging") { }
+ bool adminOnly() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; }
+ virtual LockType locktype() const { return WRITE; }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() );
+ _diaglog.flush();
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: diagLogging set to " << _diaglog.getLevel() << " from: " << was << endl;
+ result.append( "was" , was );
+ return true;
+ }
+ } cmddiaglogging;
+
+ /* remove a bit from a bit array - actually remove its slot, not just clear it
+ note: this function does not work with x == 63 -- that is ok for now,
+ but keep in mind that if the maximum number of indexes were ever
+ extended to exactly 64 it would become a problem
+ */
+ unsigned long long removeBit(unsigned long long b, int x) {
+ unsigned long long tmp = b;
+ return
+ (tmp & ((((unsigned long long) 1) << x)-1)) |
+ ((tmp >> (x+1)) << x);
+ }
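+
+ // worked example: removeBit(21, 2) -- 21 is 0b10101; the two bits below slot 2
+ // (0b01) are kept, the bits above slot 2 (0b10) shift down one slot to give
+ // 0b1001 == 9, matching the unit test below.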
+
+ struct DBCommandsUnitTest {
+ DBCommandsUnitTest() {
+ assert( removeBit(1, 0) == 0 );
+ assert( removeBit(2, 0) == 1 );
+ assert( removeBit(2, 1) == 0 );
+ assert( removeBit(255, 1) == 127 );
+ assert( removeBit(21, 2) == 9 );
+ assert( removeBit(0x4000000000000001ULL, 62) == 1 );
+ }
+ } dbc_unittest;
+
+ void assureSysIndexesEmptied(const char *ns, IndexDetails *exceptForIdIndex);
+ int removeFromSysIndexes(const char *ns, const char *idxName);
+
+ bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) {
+
+ BackgroundOperation::assertNoBgOpInProgForNs(ns);
+
+ d = d->writingWithExtra();
+ d->aboutToDeleteAnIndex();
+
+ /* there may be pointers pointing at keys in the btree(s). kill them. */
+ ClientCursor::invalidate(ns);
+
+ // delete a specific index or all?
+ if ( *name == '*' && name[1] == 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << '\n';
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+ IndexDetails *idIndex = 0;
+ if( d->nIndexes ) {
+ for ( int i = 0; i < d->nIndexes; i++ ) {
+ if ( !mayDeleteIdIndex && d->idx(i).isIdIndex() ) {
+ idIndex = &d->idx(i);
+ }
+ else {
+ d->idx(i).kill_idx();
+ }
+ }
+ d->nIndexes = 0;
+ }
+ if ( idIndex ) {
+ d->addIndex(ns) = *idIndex;
+ wassert( d->nIndexes == 1 );
+ }
+ /* assuming here that id index is not multikey: */
+ d->multiKeyIndexBits = 0;
+ assureSysIndexesEmptied(ns, idIndex);
+ anObjBuilder.append("msg", mayDeleteIdIndex ?
+ "indexes dropped for collection" :
+ "non-_id indexes dropped for collection");
+ }
+ else {
+ // delete just one index
+ int x = d->findIndexByName(name);
+ if ( x >= 0 ) {
+ log(4) << " d->nIndexes was " << d->nIndexes << endl;
+ anObjBuilder.append("nIndexesWas", (double)d->nIndexes);
+
+ /* note it is important we remove the IndexDetails with this
+ call, otherwise, on recreate, the old one would be reused, and its
+ IndexDetails::info ptr would be bad info.
+ */
+ IndexDetails *id = &d->idx(x);
+ if ( !mayDeleteIdIndex && id->isIdIndex() ) {
+ errmsg = "may not delete _id index";
+ return false;
+ }
+ id->kill_idx();
+ d->multiKeyIndexBits = removeBit(d->multiKeyIndexBits, x);
+ d->nIndexes--;
+ for ( int i = x; i < d->nIndexes; i++ )
+ d->idx(i) = d->idx(i+1);
+ }
+ else {
+ int n = removeFromSysIndexes(ns, name); // just in case there is an orphaned listing - i.e. it should have been repaired but wasn't
+ if( n ) {
+ log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl;
+ }
+ log() << "dropIndexes: " << name << " not found" << endl;
+ errmsg = "index not found";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /* drop collection */
+ class CmdDrop : public Command {
+ public:
+ CmdDrop() : Command("drop") { }
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return false;
+ }
+ virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : <collectionName>}"; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string nsToDrop = dbname + '.' + cmdObj.firstElement().valuestr();
+ NamespaceDetails *d = nsdetails(nsToDrop.c_str());
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: drop " << nsToDrop << endl;
+ if ( d == 0 ) {
+ errmsg = "ns not found";
+ return false;
+ }
+ uassert( 10039 , "can't drop collection with reserved $ character in name", strchr(nsToDrop.c_str(), '$') == 0 );
+ dropCollection( nsToDrop, errmsg, result );
+ return true;
+ }
+ } cmdDrop;
+
+ /* select count(*) */
+ class CmdCount : public Command {
+ public:
+ virtual LockType locktype() const { return READ; }
+ CmdCount() : Command("count") { }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const {
+ // ok on --slave setups
+ return replSettings.slave == SimpleSlave;
+ }
+ virtual bool slaveOverrideOk() { return true; }
+ virtual bool maintenanceOk() const { return false; }
+ virtual bool adminOnly() const { return false; }
+ virtual void help( stringstream& help ) const { help << "count objects in collection"; }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string ns = parseNs(dbname, cmdObj);
+ string err;
+ long long n = runCount(ns.c_str(), cmdObj, err);
+ long long nn = n;
+ bool ok = true;
+ if ( n == -1 ) {
+ nn = 0;
+ result.appendBool( "missing" , true );
+ }
+ else if ( n < 0 ) {
+ nn = 0;
+ ok = false;
+ if ( !err.empty() )
+ errmsg = err;
+ }
+ result.append("n", (double) nn);
+ return ok;
+ }
+ } cmdCount;
+
+ /* create collection */
+ class CmdCreate : public Command {
+ public:
+ CmdCreate() : Command("create") { }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual bool adminOnly() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "create a collection explicitly\n"
+ "{ create: <ns>[, capped: <bool>, size: <collSizeInBytes>, max: <nDocs>] }";
+ }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ uassert(15888, "must pass name of collection to create", cmdObj.firstElement().valuestrsafe()[0] != '\0');
+ string ns = dbname + '.' + cmdObj.firstElement().valuestr();
+ string err;
+ uassert(14832, "specify size:<n> when capped is true", !cmdObj["capped"].trueValue() || cmdObj["size"].isNumber() || cmdObj.hasField("$nExtents"));
+ bool ok = userCreateNS(ns.c_str(), cmdObj, err, ! fromRepl );
+ if ( !ok && !err.empty() )
+ errmsg = err;
+ return ok;
+ }
+ } cmdCreate;
+
+ /* "dropIndexes" is now the preferred form - "deleteIndexes" deprecated */
+ class CmdDropIndexes : public Command {
+ public:
+ virtual bool logTheOp() {
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "drop indexes for a collection";
+ }
+ CmdDropIndexes() : Command("dropIndexes", false, "deleteIndexes") { }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) {
+ BSONElement e = jsobj.firstElement();
+ string toDeleteNs = dbname + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: dropIndexes " << toDeleteNs << endl;
+ if ( d ) {
+ BSONElement f = jsobj.getField("index");
+ if ( f.type() == String ) {
+ return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false );
+ }
+ else if ( f.type() == Object ) {
+ int idxId = d->findIndexByKeyPattern( f.embeddedObject() );
+ if ( idxId < 0 ) {
+ errmsg = "can't find index with key:";
+ errmsg += f.embeddedObject().toString();
+ return false;
+ }
+ else {
+ IndexDetails& ii = d->idx( idxId );
+ string iName = ii.indexName();
+ return dropIndexes( d, toDeleteNs.c_str(), iName.c_str() , errmsg, anObjBuilder, false );
+ }
+ }
+ else {
+ errmsg = "invalid index name spec";
+ return false;
+ }
+ }
+ else {
+ errmsg = "ns not found";
+ return false;
+ }
+ }
+ } cmdDropIndexes;
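+
+ /* Example invocations (illustrative): drop a single index by name or by key
+ pattern, or all non-_id indexes with "*":
+ db.runCommand( { dropIndexes : "posts", index : "x_1" } )
+ db.runCommand( { dropIndexes : "posts", index : { x : 1 } } )
+ db.runCommand( { dropIndexes : "posts", index : "*" } )
+ */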
+
+ class CmdReIndex : public Command {
+ public:
+ virtual bool logTheOp() { return false; } // only reindexes on the one node
+ virtual bool slaveOk() const { return true; } // can reindex on a secondary
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream& help ) const {
+ help << "re-index a collection";
+ }
+ CmdReIndex() : Command("reIndex") { }
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ static DBDirectClient db;
+
+ BSONElement e = jsobj.firstElement();
+ string toDeleteNs = dbname + '.' + e.valuestr();
+ NamespaceDetails *d = nsdetails(toDeleteNs.c_str());
+ tlog() << "CMD: reIndex " << toDeleteNs << endl;
+ BackgroundOperation::assertNoBgOpInProgForNs(toDeleteNs.c_str());
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ list<BSONObj> all;
+ auto_ptr<DBClientCursor> i = db.query( dbname + ".system.indexes" , BSON( "ns" << toDeleteNs ) , 0 , 0 , 0 , QueryOption_SlaveOk );
+ BSONObjBuilder b;
+ while ( i->more() ) {
+ BSONObj o = i->next().removeField("v").getOwned();
+ b.append( BSONObjBuilder::numStr( all.size() ) , o );
+ all.push_back( o );
+ }
+
+
+ bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true );
+ if ( ! ok ) {
+ errmsg = "dropIndexes failed";
+ return false;
+ }
+
+ for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
+ BSONObj o = *i;
+ log(1) << "reIndex ns: " << toDeleteNs << " index: " << o << endl;
+ theDataFileMgr.insertWithObjMod( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o , true );
+ }
+
+ result.append( "nIndexes" , (int)all.size() );
+ result.appendArray( "indexes" , b.obj() );
+ return true;
+ }
+ } cmdReIndex;
+
+ class CmdListDatabases : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool slaveOverrideOk() {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream& help ) const { help << "list databases on this server"; }
+ CmdListDatabases() : Command("listDatabases" , true ) {}
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ vector< string > dbNames;
+ getDatabaseNames( dbNames );
+ vector< BSONObj > dbInfos;
+
+ set<string> seen;
+ boost::intmax_t totalSize = 0;
+ for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
+ BSONObjBuilder b;
+ b.append( "name", *i );
+
+ boost::intmax_t size = dbSize( i->c_str() );
+ b.append( "sizeOnDisk", (double) size );
+ totalSize += size;
+
+ {
+ Client::ReadContext rc( *i + ".system.namespaces" );
+ b.appendBool( "empty", rc.ctx().db()->isEmpty() );
+ }
+
+ dbInfos.push_back( b.obj() );
+
+ seen.insert( i->c_str() );
+ }
+
+ // TODO: erh 1/1/2010 I think this is broken where path != dbpath ??
+ set<string> allShortNames;
+ {
+ readlock lk;
+ dbHolder().getAllShortNames( false, allShortNames );
+ }
+
+ for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ) {
+ string name = *i;
+
+ if ( seen.count( name ) )
+ continue;
+
+ BSONObjBuilder b;
+ b.append( "name" , name );
+ b.append( "sizeOnDisk" , (double)1.0 );
+
+ {
+ readlock lk( name );
+ Client::Context ctx( name );
+ b.appendBool( "empty", ctx.db()->isEmpty() );
+ }
+
+ dbInfos.push_back( b.obj() );
+ }
+
+ result.append( "databases", dbInfos );
+ result.append( "totalSize", double( totalSize ) );
+ return true;
+ }
+ } cmdListDatabases;
+
+ /* note an access to a database right after this will open it back up - so this is mainly
+ for diagnostic purposes.
+ */
+ class CmdCloseAllDatabases : public Command {
+ public:
+ virtual void help( stringstream& help ) const { help << "Close all database files.\nA new request will cause an immediate reopening; thus, this is mostly for testing purposes."; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+
+ CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {}
+ bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+ bool ok;
+ try {
+ ok = dbHolderW().closeAll( dbpath , result, false );
+ }
+ catch(DBException&) {
+ throw;
+ }
+ catch(...) {
+ log() << "ERROR uncaught exception in command closeAllDatabases" << endl;
+ errmsg = "unexpected uncaught exception";
+ return false;
+ }
+ return ok;
+ }
+ } cmdCloseAllDatabases;
+
+ class CmdFileMD5 : public Command {
+ public:
+ CmdFileMD5() : Command( "filemd5" ) {}
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream& help ) const {
+ help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }";
+ }
+ virtual LockType locktype() const { return READ; }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname;
+ ns += ".";
+ {
+ string root = jsobj.getStringField( "root" );
+ if ( root.size() == 0 )
+ root = "fs";
+ ns += root;
+ }
+ ns += ".chunks"; // make this an option in jsobj
+
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+
+ BSONObj query = BSON( "files_id" << jsobj["filemd5"] );
+ BSONObj sort = BSON( "files_id" << 1 << "n" << 1 );
+
+ shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str(), query, sort);
+ if ( ! cursor ) {
+ errmsg = "need an index on { files_id : 1 , n : 1 }";
+ return false;
+ }
+ auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str()));
+
+ int n = 0;
+ while ( cursor->ok() ) {
+ if ( ! cursor->matcher()->matchesCurrent( cursor.get() ) ) {
+ log() << "**** NOT MATCHING ****" << endl;
+ PRINT(cursor->current());
+ cursor->advance();
+ continue;
+ }
+
+ BSONObj obj = cursor->current();
+ cursor->advance();
+
+ BSONElement ne = obj["n"];
+ assert(ne.isNumber());
+ int myn = ne.numberInt();
+ if ( n != myn ) {
+ log() << "should have chunk: " << n << " have:" << myn << endl;
+ dumpChunks( ns , query , sort );
+ uassert( 10040 , "chunks out of order" , n == myn );
+ }
+
+ int len;
+ const char * data = obj["data"].binDataClean( len );
+
+ ClientCursor::YieldLock yield (cc.get());
+ try {
+ md5_append( &st , (const md5_byte_t*)(data) , len );
+ n++;
+ }
+ catch (...) {
+ if ( ! yield.stillOk() ) // relocks
+ cc.release();
+ throw;
+ }
+
+ if ( ! yield.stillOk() ) {
+ cc.release();
+ uasserted(13281, "File deleted during filemd5 command");
+ }
+ }
+
+ md5_finish(&st, d);
+
+ result.append( "numChunks" , n );
+ result.append( "md5" , digestToString( d ) );
+ return true;
+ }
+
+ void dumpChunks( const string& ns , const BSONObj& query , const BSONObj& sort ) {
+ DBDirectClient client;
+ Query q(query);
+ q.sort(sort);
+ auto_ptr<DBClientCursor> c = client.query(ns, q);
+ while(c->more())
+ PRINT(c->nextSafe());
+ }
+ } cmdFileMD5;
+
+ static IndexDetails *cmdIndexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( ns[ 0 ] == '\0' || min.isEmpty() || max.isEmpty() ) {
+ errmsg = "invalid command syntax (note: min and max are required)";
+ return 0;
+ }
+ return indexDetailsForRange( ns, errmsg, min, max, keyPattern );
+ }
+
+ class CmdDatasize : public Command {
+ virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const {
+ return parseNsFullyQualified(dbname, cmdObj);
+ }
+ public:
+ CmdDatasize() : Command( "dataSize", false, "datasize" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "determine data size for a set of data in a certain range"
+ "\nexample: { dataSize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }"
+ "\nkeyPattern, min, and max parameters are optional."
+ "\nnote: This command may take a while to run";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ Timer timer;
+
+ string ns = jsobj.firstElement().String();
+ BSONObj min = jsobj.getObjectField( "min" );
+ BSONObj max = jsobj.getObjectField( "max" );
+ BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
+ bool estimate = jsobj["estimate"].trueValue();
+
+ Client::Context ctx( ns );
+ NamespaceDetails *d = nsdetails(ns.c_str());
+
+ if ( ! d || d->stats.nrecords == 0 ) {
+ result.appendNumber( "size" , 0 );
+ result.appendNumber( "numObjects" , 0 );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+
+ result.appendBool( "estimate" , estimate );
+
+ shared_ptr<Cursor> c;
+ if ( min.isEmpty() && max.isEmpty() ) {
+ if ( estimate ) {
+ result.appendNumber( "size" , d->stats.datasize );
+ result.appendNumber( "numObjects" , d->stats.nrecords );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+ c = theDataFileMgr.findAll( ns.c_str() );
+ }
+ else if ( min.isEmpty() || max.isEmpty() ) {
+ errmsg = "only one of min or max specified";
+ return false;
+ }
+ else {
+ IndexDetails *idx = cmdIndexDetailsForRange( ns.c_str(), errmsg, min, max, keyPattern );
+ if ( idx == 0 )
+ return false;
+
+ c.reset( BtreeCursor::make( d, d->idxNo(*idx), *idx, min, max, false, 1 ) );
+ }
+
+ long long avgObjSize = d->stats.datasize / d->stats.nrecords;
+
+ long long maxSize = jsobj["maxSize"].numberLong();
+ long long maxObjects = jsobj["maxObjects"].numberLong();
+
+ long long size = 0;
+ long long numObjects = 0;
+ while( c->ok() ) {
+
+ if ( estimate )
+ size += avgObjSize;
+ else
+ size += c->currLoc().rec()->netLength();
+
+ numObjects++;
+
+ if ( ( maxSize && size > maxSize ) ||
+ ( maxObjects && numObjects > maxObjects ) ) {
+ result.appendBool( "maxReached" , true );
+ break;
+ }
+
+ c->advance();
+ }
+
+ ostringstream os;
+ os << "Finding size for ns: " << ns;
+ if ( ! min.isEmpty() ) {
+ os << " between " << min << " and " << max;
+ }
+ logIfSlow( timer , os.str() );
+
+ result.appendNumber( "size", size );
+ result.appendNumber( "numObjects" , numObjects );
+ result.append( "millis" , timer.millis() );
+ return true;
+ }
+ } cmdDatasize;
+
+ namespace {
+ long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ) {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd )
+ return 0;
+
+ long long totalSize = 0;
+
+ NamespaceDetails::IndexIterator ii = nsd->ii();
+ while ( ii.more() ) {
+ IndexDetails& d = ii.next();
+ string collNS = d.indexNamespace();
+ NamespaceDetails * mine = nsdetails( collNS.c_str() );
+ if ( ! mine ) {
+ log() << "error: have index [" << collNS << "] but no NamespaceDetails" << endl;
+ continue;
+ }
+ totalSize += mine->stats.datasize;
+ if ( details )
+ details->appendNumber( d.indexName() , mine->stats.datasize / scale );
+ }
+ return totalSize;
+ }
+ }
+
+ class CollectionStats : public Command {
+ public:
+ CollectionStats() : Command( "collStats", false, "collstats" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. for KB use 1024\n"
+ " avgObjSize - in bytes";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname + "." + jsobj.firstElement().valuestr();
+ Client::Context cx( ns );
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd ) {
+ errmsg = "ns not found";
+ return false;
+ }
+
+ result.append( "ns" , ns.c_str() );
+
+ int scale = 1;
+ if ( jsobj["scale"].isNumber() ) {
+ scale = jsobj["scale"].numberInt();
+ if ( scale <= 0 ) {
+ errmsg = "scale has to be > 0";
+ return false;
+ }
+ }
+ else if ( jsobj["scale"].trueValue() ) {
+ errmsg = "scale has to be a number > 0";
+ return false;
+ }
+
+ bool verbose = jsobj["verbose"].trueValue();
+
+ long long size = nsd->stats.datasize / scale;
+ result.appendNumber( "count" , nsd->stats.nrecords );
+ result.appendNumber( "size" , size );
+ if( nsd->stats.nrecords )
+ result.append ( "avgObjSize" , double(size) / double(nsd->stats.nrecords) );
+
+ int numExtents;
+ BSONArrayBuilder extents;
+
+ result.appendNumber( "storageSize" , nsd->storageSize( &numExtents , verbose ? &extents : 0 ) / scale );
+ result.append( "numExtents" , numExtents );
+ result.append( "nindexes" , nsd->nIndexes );
+ result.append( "lastExtentSize" , nsd->lastExtentSize / scale );
+ result.append( "paddingFactor" , nsd->paddingFactor );
+ result.append( "flags" , nsd->flags );
+
+ BSONObjBuilder indexSizes;
+ result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale );
+ result.append("indexSizes", indexSizes.obj());
+
+ if ( nsd->capped ) {
+ result.append( "capped" , nsd->capped );
+ result.append( "max" , nsd->max );
+ }
+
+ if ( verbose )
+ result.appendArray( "extents" , extents.arr() );
+
+ return true;
+ }
+ } cmdCollectionStats;
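+
+ /* Example (illustrative): report sizes in kilobytes rather than bytes by
+ passing scale, which divides each size field:
+ db.runCommand( { collStats : "posts", scale : 1024 } )
+ */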
+
+ class DBStats : public Command {
+ public:
+ DBStats() : Command( "dbStats", false, "dbstats" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream &help ) const {
+ help <<
+ "Get stats on a database. Not instantaneous. Slower for databases with large .ns files.\n" <<
+ "Example: { dbStats:1, scale:1 }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int scale = 1;
+ if ( jsobj["scale"].isNumber() ) {
+ scale = jsobj["scale"].numberInt();
+ if ( scale <= 0 ) {
+ errmsg = "scale has to be > 0";
+ return false;
+ }
+ }
+ else if ( jsobj["scale"].trueValue() ) {
+ errmsg = "scale has to be a number > 0";
+ return false;
+ }
+
+ list<string> collections;
+ Database* d = cc().database();
+ if ( d )
+ d->namespaceIndex.getNamespaces( collections );
+
+ long long ncollections = 0;
+ long long objects = 0;
+ long long size = 0;
+ long long storageSize = 0;
+ long long numExtents = 0;
+ long long indexes = 0;
+ long long indexSize = 0;
+
+ for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) {
+ const string ns = *it;
+
+ NamespaceDetails * nsd = nsdetails( ns.c_str() );
+ if ( ! nsd ) {
+ errmsg = "missing ns: ";
+ errmsg += ns;
+ return false;
+ }
+
+ ncollections += 1;
+ objects += nsd->stats.nrecords;
+ size += nsd->stats.datasize;
+
+ int temp;
+ storageSize += nsd->storageSize( &temp );
+ numExtents += temp;
+
+ indexes += nsd->nIndexes;
+ indexSize += getIndexSizeForCollection(dbname, ns);
+ }
+
+ result.append ( "db" , dbname );
+ result.appendNumber( "collections" , ncollections );
+ result.appendNumber( "objects" , objects );
+ result.append ( "avgObjSize" , objects == 0 ? 0 : double(size) / double(objects) );
+ result.appendNumber( "dataSize" , size / scale );
+ result.appendNumber( "storageSize" , storageSize / scale);
+ result.appendNumber( "numExtents" , numExtents );
+ result.appendNumber( "indexes" , indexes );
+ result.appendNumber( "indexSize" , indexSize / scale );
+ result.appendNumber( "fileSize" , d->fileSize() / scale );
+ if( d )
+ result.appendNumber( "nsSizeMB", (int) d->namespaceIndex.fileLength() / 1024 / 1024 );
+
+ return true;
+ }
+ } cmdDBStats;
+
+ /* convertToCapped seems to use this */
+ class CmdCloneCollectionAsCapped : public Command {
+ public:
+ CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string from = jsobj.getStringField( "cloneCollectionAsCapped" );
+ string to = jsobj.getStringField( "toCollection" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || to.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ string fromNs = dbname + "." + from;
+ string toNs = dbname + "." + to;
+ NamespaceDetails *nsd = nsdetails( fromNs.c_str() );
+ massert( 10301 , "source collection " + fromNs + " does not exist", nsd );
+ long long excessSize = nsd->stats.datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size'
+ DiskLoc extent = nsd->firstExtent;
+ for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) {
+ excessSize -= extent.ext()->length;
+ log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl;
+ log( 6 ) << "excessSize: " << excessSize << endl;
+ }
+ DiskLoc startLoc = extent.ext()->firstRecord;
+
+ CursorId id;
+ {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll( fromNs.c_str(), startLoc );
+ ClientCursor *cc = new ClientCursor(0, c, fromNs.c_str());
+ id = cc->cursorid();
+ }
+
+ DBDirectClient client;
+ Client::Context ctx( toNs );
+ BSONObjBuilder spec;
+ spec.appendBool( "capped", true );
+ spec.append( "size", double( size ) );
+ if ( !userCreateNS( toNs.c_str(), spec.done(), errmsg, true ) )
+ return false;
+
+ auto_ptr< DBClientCursor > c = client.getMore( fromNs, id );
+ while( c->more() ) {
+ BSONObj obj = c->next();
+ theDataFileMgr.insertAndLog( toNs.c_str(), obj, true );
+ getDur().commitIfNeeded();
+ }
+
+ return true;
+ }
+ } cmdCloneCollectionAsCapped;
+
+ /* jan2010:
+ Converts the given collection to a capped collection w/ the specified size.
+ This command is not highly used, and is not currently supported in sharded
+ environments.
+ */
+ class CmdConvertToCapped : public Command {
+ public:
+ CmdConvertToCapped() : Command( "convertToCapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual void help( stringstream &help ) const {
+ help << "{ convertToCapped:<fromCollectionName>, size:<sizeInBytes> }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str());
+
+ string from = jsobj.getStringField( "convertToCapped" );
+ long long size = (long long)jsobj.getField( "size" ).number();
+
+ if ( from.empty() || size == 0 ) {
+ errmsg = "invalid command spec";
+ return false;
+ }
+
+ string shortTmpName = str::stream() << ".tmp.convertToCapped." << from;
+ string longTmpName = str::stream() << dbname << "." << shortTmpName;
+
+ DBDirectClient client;
+ client.dropCollection( longTmpName );
+
+ BSONObj info;
+ if ( !client.runCommand( dbname ,
+ BSON( "cloneCollectionAsCapped" << from << "toCollection" << shortTmpName << "size" << double( size ) ),
+ info ) ) {
+ errmsg = "cloneCollectionAsCapped failed: " + info.toString();
+ return false;
+ }
+
+ if ( !client.dropCollection( dbname + "." + from ) ) {
+ errmsg = "failed to drop original collection";
+ return false;
+ }
+
+ if ( !client.runCommand( "admin",
+ BSON( "renameCollection" << longTmpName <<
+ "to" << ( dbname + "." + from ) ),
+ info ) ) {
+ errmsg = "renameCollection failed: " + info.toString();
+ return false;
+ }
+
+ return true;
+ }
+ } cmdConvertToCapped;
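+
+ /* Hedged usage sketch (shell syntax; name and size illustrative):
+ db.runCommand( { convertToCapped: "log", size: 1048576 } )
+ clones <db>.log into a temporary capped collection named .tmp.convertToCapped.log,
+ drops the original, then renames the temp collection back to <db>.log. */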
+
+ /* Returns client's uri */
+ class CmdWhatsMyUri : public Command {
+ public:
+ CmdWhatsMyUri() : Command("whatsmyuri") { }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "{whatsmyuri:1}";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ BSONObj info = cc().curop()->infoNoauth();
+ result << "you" << info[ "client" ];
+ return true;
+ }
+ } cmdWhatsMyUri;
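+
+ /* Hedged usage sketch: db.runCommand( { whatsmyuri: 1 } ) returns something like
+ { you: "127.0.0.1:50630", ok: 1 } -- the client host:port as the server sees it
+ (address illustrative). */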
+
+ /* For testing only, not for general use */
+ class GodInsert : public Command {
+ public:
+ GodInsert() : Command( "godinsert" ) { }
+ virtual bool adminOnly() const { return false; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return true; }
+ virtual void help( stringstream &help ) const {
+ help << "internal. for testing only.";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ if ( ! ai->isLocalHost ) {
+ errmsg = "godinsert only works locally";
+ return false;
+ }
+
+ string coll = cmdObj[ "godinsert" ].valuestrsafe();
+ log() << "test only command godinsert invoked coll:" << coll << endl;
+ uassert( 13049, "godinsert must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ BSONObj obj = cmdObj[ "obj" ].embeddedObjectUserCheck();
+ {
+ dblock lk;
+ Client::Context ctx( ns );
+ theDataFileMgr.insertWithObjMod( ns.c_str(), obj, true );
+ }
+ return true;
+ }
+ } cmdGodInsert;
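+
+ /* Hedged usage sketch (test only; rejected unless run from localhost):
+ db.runCommand( { godinsert: "foo", obj: { _id: 1 } } )
+ inserts the given object directly into <db>.foo. */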
+
+ class DBHashCmd : public Command {
+ public:
+ DBHashCmd() : Command( "dbHash", false, "dbhash" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ list<string> colls;
+ Database* db = cc().database();
+ if ( db )
+ db->namespaceIndex.getNamespaces( colls );
+ colls.sort();
+
+ result.appendNumber( "numCollections" , (long long)colls.size() );
+ result.append( "host" , prettyHostName() );
+
+ md5_state_t globalState;
+ md5_init(&globalState);
+
+ BSONObjBuilder bb( result.subobjStart( "collections" ) );
+ for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ) {
+ string c = *i;
+ if ( c.find( ".system.profil" ) != string::npos )
+ continue;
+
+ shared_ptr<Cursor> cursor;
+
+ NamespaceDetails * nsd = nsdetails( c.c_str() );
+
+ // debug SERVER-761
+ NamespaceDetails::IndexIterator ii = nsd->ii();
+ while( ii.more() ) {
+ const IndexDetails &idx = ii.next();
+ if ( !idx.head.isValid() || !idx.info.isValid() ) {
+ log() << "invalid index for ns: " << c << " " << idx.head << " " << idx.info;
+ if ( idx.info.isValid() )
+ log() << " " << idx.info.obj();
+ log() << endl;
+ }
+ }
+
+ int idNum = nsd->findIdIndex();
+ if ( idNum >= 0 ) {
+ cursor.reset( BtreeCursor::make( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) );
+ }
+ else if ( c.find( ".system." ) != string::npos ) {
+ continue;
+ }
+ else if ( nsd->capped ) {
+ cursor = findTableScan( c.c_str() , BSONObj() );
+ }
+ else {
+ log() << "can't find _id index for: " << c << endl;
+ continue;
+ }
+
+ md5_state_t st;
+ md5_init(&st);
+
+ long long n = 0;
+ while ( cursor->ok() ) {
+ BSONObj c = cursor->current();
+ md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() );
+ n++;
+ cursor->advance();
+ }
+ md5digest d;
+ md5_finish(&st, d);
+ string hash = digestToString( d );
+
+ bb.append( c.c_str() + ( dbname.size() + 1 ) , hash );
+
+ md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() );
+ }
+ bb.done();
+
+ md5digest d;
+ md5_finish(&globalState, d);
+ string hash = digestToString( d );
+
+ result.append( "md5" , hash );
+
+ return 1;
+ }
+
+ } dbhashCmd;
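+
+ /* Hedged output sketch for { dbHash: 1 } (values illustrative):
+ { numCollections: 2, host: "example:27017",
+ collections: { foo: "<hex md5>", bar: "<hex md5>" },
+ md5: "<hex md5>", ok: 1 }
+ each collection is hashed in _id-index order (natural order for capped
+ collections without an _id index); the top-level md5 chains the
+ per-collection hex digests in sorted collection-name order. */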
+
+ /* for diagnostic / testing purposes. */
+ class CmdSleep : public Command {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream& help ) const {
+ help << "internal testing command. Makes db block (in a read lock) for 100 seconds\n";
+ help << "w:true write lock. secs:<seconds>";
+ }
+ CmdSleep() : Command("sleep") { }
+ bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "test only command sleep invoked" << endl;
+ int secs = 100;
+ if ( cmdObj["secs"].isNumber() )
+ secs = cmdObj["secs"].numberInt();
+ if( cmdObj.getBoolField("w") ) {
+ writelock lk("");
+ sleepsecs(secs);
+ }
+ else {
+ readlock lk("");
+ sleepsecs(secs);
+ }
+ return true;
+ }
+ } cmdSleep;
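+
+ /* Hedged usage sketch (test only):
+ db.adminCommand( { sleep: 1, w: true, secs: 5 } )
+ holds the write lock for 5 seconds; without w:true a read lock is held,
+ and without secs the 100 second default applies. */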
+
+ // just for testing
+ class CapTrunc : public Command {
+ public:
+ CapTrunc() : Command( "captrunc" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool requiresAuth() { return true; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string coll = cmdObj[ "captrunc" ].valuestrsafe();
+ uassert( 13416, "captrunc must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ int n = cmdObj.getIntField( "n" );
+
+ // inc: whether the truncation point record is itself removed (inclusive) or kept
+ bool inc = cmdObj.getBoolField( "inc" );
+ NamespaceDetails *nsd = nsdetails( ns.c_str() );
+ ReverseCappedCursor c( nsd );
+ massert( 13417, "captrunc collection not found or empty", c.ok() );
+ for( int i = 0; i < n; ++i ) {
+ massert( 13418, "captrunc invalid n", c.advance() );
+ }
+ DiskLoc end = c.currLoc();
+ nsd->cappedTruncateAfter( ns.c_str(), end, inc );
+ return true;
+ }
+ } capTruncCmd;
+
+ // just for testing
+ class EmptyCapped : public Command {
+ public:
+ EmptyCapped() : Command( "emptycapped" ) {}
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool requiresAuth() { return true; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string coll = cmdObj[ "emptycapped" ].valuestrsafe();
+ uassert( 13428, "emptycapped must specify a collection", !coll.empty() );
+ string ns = dbname + "." + coll;
+ NamespaceDetails *nsd = nsdetails( ns.c_str() );
+ massert( 13429, "emptycapped no such collection", nsd );
+ nsd->emptyCappedCollection( ns.c_str() );
+ return true;
+ }
+ } emptyCappedCmd;
+
+ bool _execCommand(Command *c, const string& dbname, BSONObj& cmdObj, int queryOptions, BSONObjBuilder& result, bool fromRepl) {
+
+ try {
+ string errmsg;
+ if ( ! c->run(dbname, cmdObj, queryOptions, errmsg, result, fromRepl ) ) {
+ result.append( "errmsg" , errmsg );
+ return false;
+ }
+ }
+ catch ( SendStaleConfigException& e ){
+ log(1) << "command failed because of stale config, can retry" << causedBy( e ) << endl;
+ throw;
+ }
+ catch ( DBException& e ) {
+
+ // TODO: Rethrown errors have issues here, should divorce SendStaleConfigException from the DBException tree
+
+ stringstream ss;
+ ss << "exception: " << e.what();
+ result.append( "errmsg" , ss.str() );
+ result.append( "code" , e.getCode() );
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * this handles
+ * - auth
+ * - maintenance mode
+ * - locking
+ * - context
+ * then calls run()
+ */
+ bool execCommand( Command * c ,
+ Client& client , int queryOptions ,
+ const char *cmdns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl ) {
+
+ string dbname = nsToDatabase( cmdns );
+
+ AuthenticationInfo *ai = client.getAuthenticationInfo();
+
+ if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) {
+ result.append( "errmsg" ,
+ "unauthorized: this command must run from localhost when running db without auth" );
+ log() << "command denied: " << cmdObj.toString() << endl;
+ return false;
+ }
+
+ if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) {
+ result.append( "errmsg" , "access denied; use admin db" );
+ log() << "command denied: " << cmdObj.toString() << endl;
+ return false;
+ }
+
+ if ( cmdObj["help"].trueValue() ) {
+ client.curop()->ensureStarted();
+ stringstream ss;
+ ss << "help for: " << c->name << " ";
+ c->help( ss );
+ result.append( "help" , ss.str() );
+ result.append( "lockType" , c->locktype() );
+ return true;
+ }
+
+ bool canRunHere =
+ isMaster( dbname.c_str() ) ||
+ c->slaveOk() ||
+ ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) ||
+ fromRepl;
+
+ if ( ! canRunHere ) {
+ result.append( "errmsg" , "not master" );
+ result.append( "note" , "from execCommand" );
+ return false;
+ }
+
+ if ( ! c->maintenanceOk() && theReplSet && ! isMaster( dbname.c_str() ) && ! theReplSet->isSecondary() ) {
+ result.append( "errmsg" , "node is recovering" );
+ result.append( "note" , "from execCommand" );
+ return false;
+ }
+
+ if ( c->adminOnly() )
+ log( 2 ) << "command: " << cmdObj << endl;
+
+ if (c->maintenanceMode() && theReplSet && theReplSet->isSecondary()) {
+ theReplSet->setMaintenanceMode(true);
+ }
+
+ bool retval = false;
+ if ( c->locktype() == Command::NONE ) {
+ // locktype NONE: no lock is taken; we also trust that the command won't crash
+ retval = true;
+
+ if ( c->requiresAuth() ) {
+ // test that the user has at least read permissions
+ if ( ! client.getAuthenticationInfo()->isAuthorizedReads( dbname ) ) {
+ result.append( "errmsg" , "need to login" );
+ retval = false;
+ }
+ }
+
+ if (retval) {
+ client.curop()->ensureStarted();
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ }
+ }
+ else if( c->locktype() != Command::WRITE ) {
+ // read lock
+ assert( ! c->logTheOp() );
+ string ns = c->parseNs(dbname, cmdObj);
+ Client::ReadContext ctx( ns , dbpath, c->requiresAuth() ); // read locks
+ client.curop()->ensureStarted();
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ }
+ else {
+ dassert( c->locktype() == Command::WRITE );
+ writelock lk;
+ client.curop()->ensureStarted();
+ Client::Context ctx( dbname , dbpath , c->requiresAuth() );
+ retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl );
+ if ( retval && c->logTheOp() && ! fromRepl ) {
+ logOp("c", cmdns, cmdObj);
+ }
+ }
+
+ if (c->maintenanceMode() && theReplSet) {
+ theReplSet->setMaintenanceMode(false);
+ }
+
+ return retval;
+ }
+
+
+ /* TODO make these all command objects -- legacy stuff here
+
+ usage:
+ abc.$cmd.findOne( { ismaster:1 } );
+
+ returns true if ran a cmd
+ */
+ bool _runCommands(const char *ns, BSONObj& _cmdobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ string dbname = nsToDatabase( ns );
+
+ if( logLevel >= 1 )
+ log() << "run command " << ns << ' ' << _cmdobj << endl;
+
+ const char *p = strchr(ns, '.');
+ if ( !p ) return false;
+ if ( strcmp(p, ".$cmd") != 0 ) return false;
+
+ BSONObj jsobj;
+ {
+ BSONElement e = _cmdobj.firstElement();
+ if ( e.type() == Object && (e.fieldName()[0] == '$'
+ ? str::equals("query", e.fieldName()+1)
+ : str::equals("query", e.fieldName())))
+ {
+ jsobj = e.embeddedObject();
+ }
+ else {
+ jsobj = _cmdobj;
+ }
+ }
+
+ Client& client = cc();
+ bool ok = false;
+
+ BSONElement e = jsobj.firstElement();
+
+ Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0;
+
+ if ( c ) {
+ ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl );
+ }
+ else {
+ anObjBuilder.append("errmsg", str::stream() << "no such cmd: " << e.fieldName() );
+ anObjBuilder.append("bad cmd" , _cmdobj );
+ }
+
+ // switch to bool, but wait a bit longer before switching?
+ // anObjBuilder.append("ok", ok);
+ anObjBuilder.append("ok", ok?1.0:0.0);
+ BSONObj x = anObjBuilder.done();
+ b.appendBuf((void*) x.objdata(), x.objsize());
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/dbcommands_admin.cpp b/src/mongo/db/dbcommands_admin.cpp
new file mode 100644
index 00000000000..ffcc3f261fe
--- /dev/null
+++ b/src/mongo/db/dbcommands_admin.cpp
@@ -0,0 +1,550 @@
+// dbcommands_admin.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ this file has dbcommands that are for dba type administration,
+ mostly around dbs and collections,
+ NOT system stuff
+*/
+
+
+#include "pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "namespace-inl.h"
+#include "commands.h"
+#include "cmdline.h"
+#include "btree.h"
+#include "curop-inl.h"
+#include "../util/background.h"
+#include "../util/logfile.h"
+#include "../util/alignedbuilder.h"
+#include "../util/paths.h"
+#include "../scripting/engine.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ class CleanCmd : public Command {
+ public:
+ CleanCmd() : Command( "clean" ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return WRITE; }
+
+ virtual void help(stringstream& h) const { h << "internal"; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string dropns = dbname + "." + cmdObj.firstElement().valuestrsafe();
+
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: clean " << dropns << endl;
+
+ NamespaceDetails *d = nsdetails(dropns.c_str());
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ for ( int i = 0; i < Buckets; i++ )
+ d->deletedList[i].Null();
+
+ result.append("ns", dropns.c_str());
+ return 1;
+ }
+
+ } cleanCmd;
+
+ namespace dur {
+ boost::filesystem::path getJournalDir();
+ }
+
+ class JournalLatencyTestCmd : public Command {
+ public:
+ JournalLatencyTestCmd() : Command( "journalLatencyTest" ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+ virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; }
+
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ boost::filesystem::path p = dur::getJournalDir();
+ p /= "journalLatencyTest";
+
+ // remove file if already present
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ BSONObjBuilder bb[2];
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ AlignedBuilder b(1024 * 1024);
+ {
+ Timer t;
+ for( int i = 0 ; i < 100; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ bb[pass].append("8KB", t.millis() / 100.0);
+ }
+ {
+ const int N = 50;
+ Timer t2;
+ long long x = 0;
+ for( int i = 0 ; i < N; i++ ) {
+ Timer t;
+ f.synchronousAppend(b.buf(), 8192);
+ x += t.micros();
+ sleepmillis(4);
+ }
+ long long y = t2.micros() - 4*N*1000;
+ // not really trusting the timer granularity on all platforms, so take whichever of x and y is higher
+ bb[pass].append("8KBWithPauses", max(x,y) / (N*1000.0));
+ }
+ {
+ Timer t;
+ for( int i = 0 ; i < 20; i++ ) {
+ f.synchronousAppend(b.buf(), 1024 * 1024);
+ }
+ bb[pass].append("1MB", t.millis() / 20.0);
+ }
+ // second time around, we are prealloced.
+ }
+ result.append("timeMillis", bb[0].obj());
+ result.append("timeMillisWithPrealloc", bb[1].obj());
+
+ try {
+ remove(p);
+ }
+ catch(...) { }
+
+ try {
+ result.append("onSamePartition", onSamePartition(dur::getJournalDir().string(), dbpath));
+ }
+ catch(...) { }
+
+ return 1;
+ }
+ } journalLatencyTestCmd;
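+
+ /* Hedged usage sketch: db.adminCommand( { journalLatencyTest: 1 } ) returns, roughly,
+ { timeMillis: { "8KB": <ms>, "8KBWithPauses": <ms>, "1MB": <ms> },
+ timeMillisWithPrealloc: { ... }, onSamePartition: <bool>, ok: 1 }
+ i.e. average milliseconds per synchronous append at each write size, the second
+ pass measured against the already-preallocated test file. */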
+
+ class ValidateCmd : public Command {
+ public:
+ ValidateCmd() : Command( "validate" ) {}
+
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow.\n"
+ "Add full:true option to do a more thorough check"; }
+
+ virtual LockType locktype() const { return READ; }
+ // { validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] [, full: <bool>] }
+
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestrsafe();
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( !cmdLine.quiet )
+ tlog() << "CMD: validate " << ns << endl;
+
+ if ( ! d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ result.append( "ns", ns );
+ validateNS( ns.c_str() , d, cmdObj, result);
+ return 1;
+ }
+
+ private:
+ void validateNS(const char *ns, NamespaceDetails *d, const BSONObj& cmdObj, BSONObjBuilder& result) {
+ const bool full = cmdObj["full"].trueValue();
+ const bool scanData = full || cmdObj["scandata"].trueValue();
+
+ bool valid = true;
+ BSONArrayBuilder errors; // explanation(s) for why valid = false
+ if ( d->capped ){
+ result.append("capped", d->capped);
+ result.append("max", d->max);
+ }
+
+ result.append("firstExtent", str::stream() << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString());
+ result.append( "lastExtent", str::stream() << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString());
+
+ BSONArrayBuilder extentData;
+
+ try {
+ d->firstExtent.ext()->assertOk();
+ d->lastExtent.ext()->assertOk();
+
+ DiskLoc el = d->firstExtent;
+ int ne = 0;
+ while( !el.isNull() ) {
+ Extent *e = el.ext();
+ e->assertOk();
+ el = e->xnext;
+ ne++;
+ if ( full )
+ extentData << e->dump();
+
+ killCurrentOp.checkForInterrupt();
+ }
+ result.append("extentCount", ne);
+ }
+ catch (...) {
+ valid=false;
+ errors << "extent asserted";
+ }
+
+ if ( full )
+ result.appendArray( "extents" , extentData.arr() );
+
+
+ result.appendNumber("datasize", d->stats.datasize);
+ result.appendNumber("nrecords", d->stats.nrecords);
+ result.appendNumber("lastExtentSize", d->lastExtentSize);
+ result.appendNumber("padding", d->paddingFactor);
+
+
+ try {
+
+ try {
+ result.append("firstExtentDetails", d->firstExtent.ext()->dump());
+
+ valid = valid && d->firstExtent.ext()->validates() &&
+ d->firstExtent.ext()->xprev.isNull();
+ }
+ catch (...) {
+ errors << "exception firstextent";
+ valid = false;
+ }
+
+ set<DiskLoc> recs;
+ if( scanData ) {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ int n = 0;
+ int nInvalid = 0;
+ long long len = 0;
+ long long nlen = 0;
+ int outOfOrder = 0;
+ DiskLoc cl_last;
+ while ( c->ok() ) {
+ n++;
+
+ DiskLoc cl = c->currLoc();
+ if ( n < 1000000 )
+ recs.insert(cl);
+ if ( d->capped ) {
+ if ( cl < cl_last )
+ outOfOrder++;
+ cl_last = cl;
+ }
+
+ Record *r = c->_current();
+ len += r->lengthWithHeaders;
+ nlen += r->netLength();
+
+ if (full){
+ BSONObj obj(r);
+ if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
+ valid = false;
+ if (nInvalid == 0) // only append this error message once
+ errors << "invalid bson object detected (see logs for more info)";
+
+ nInvalid++;
+ if (strcmp("_id", obj.firstElementFieldName()) == 0){
+ try {
+ obj.firstElement().validate(); // throws on error
+ log() << "Invalid bson detected in " << ns << " with _id: " << obj.firstElement().toString(false) << endl;
+ }
+ catch(...){
+ log() << "Invalid bson detected in " << ns << " with corrupt _id" << endl;
+ }
+ }
+ else {
+ log() << "Invalid bson detected in " << ns << " and couldn't find _id" << endl;
+ }
+ }
+ }
+
+ c->advance();
+ }
+ if ( d->capped && !d->capLooped() ) {
+ result.append("cappedOutOfOrder", outOfOrder);
+ if ( outOfOrder > 1 ) {
+ valid = false;
+ errors << "too many out of order records";
+ }
+ }
+ result.append("objectsFound", n);
+
+ if (full) {
+ result.append("invalidObjects", nInvalid);
+ }
+
+ result.appendNumber("bytesWithHeaders", len);
+ result.appendNumber("bytesWithoutHeaders", nlen);
+ }
+
+ BSONArrayBuilder deletedListArray;
+ for ( int i = 0; i < Buckets; i++ ) {
+ deletedListArray << d->deletedList[i].isNull();
+ }
+
+ int ndel = 0;
+ long long delSize = 0;
+ int incorrect = 0;
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc loc = d->deletedList[i];
+ try {
+ int k = 0;
+ while ( !loc.isNull() ) {
+ if ( recs.count(loc) )
+ incorrect++;
+ ndel++;
+
+ if ( loc.questionable() ) {
+ if( d->capped && !loc.isValid() && i == 1 ) {
+ /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
+ see comments in namespace.h
+ */
+ break;
+ }
+
+ if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
+ string err (str::stream() << "bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k);
+ errors << err;
+
+ valid = false;
+ break;
+ }
+ }
+
+ DeletedRecord *d = loc.drec();
+ delSize += d->lengthWithHeaders;
+ loc = d->nextDeleted;
+ k++;
+ killCurrentOp.checkForInterrupt();
+ }
+ }
+ catch (...) {
+ errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
+ valid = false;
+ }
+ }
+ result.appendNumber("deletedCount", ndel);
+ result.appendNumber("deletedSize", delSize);
+
+ if ( incorrect ) {
+ errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
+ valid = false;
+ }
+
+ int idxn = 0;
+ try {
+ result.append("nIndexes", d->nIndexes);
+ BSONObjBuilder indexes; // not using subObjStart to be exception safe
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& id = i.next();
+ long long keys = id.idxInterface().fullValidate(id.head, id.keyPattern());
+ indexes.appendNumber(id.indexNamespace(), keys);
+ }
+ result.append("keysPerIndex", indexes.done());
+ }
+ catch (...) {
+ errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
+ valid=false;
+ }
+
+ }
+ catch (AssertionException) {
+ errors << "exception during validate";
+ valid = false;
+ }
+
+ result.appendBool("valid", valid);
+ result.append("errors", errors.arr());
+
+ if ( !full ){
+ result.append("warning", "Some checks omitted for speed. use {full:true} option to do more thorough scan.");
+ }
+
+ if ( !valid ) {
+ result.append("advice", "ns corrupt, requires repair");
+ }
+
+ }
+ } validateCmd;
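+
+ /* Hedged usage sketch: db.runCommand( { validate: "foo", full: true } )
+ walks <db>.foo's extents, records, deleted lists and indexes, returning the
+ counts gathered above plus valid:<bool> and an errors array; full:true adds
+ per-document BSON validity checks. */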
+
+ bool lockedForWriting = false; // read from db/instance.cpp
+ static bool unlockRequested = false;
+ static mongo::mutex fsyncLockMutex("fsyncLock");
+ static boost::condition fsyncLockCondition;
+ static OID fsyncLockID; // identifies the current lock job
+
+ /*
+ class UnlockCommand : public Command {
+ public:
+ UnlockCommand() : Command( "unlock" ) { }
+ virtual bool readOnly() { return true; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ errmsg = "unlock requested";
+ unlockRequested = true;
+ }
+ else {
+ errmsg = "not locked, so cannot unlock";
+ return 0;
+ }
+ return 1;
+ }
+
+ } unlockCommand;
+ */
+ /* see unlockFsync() for unlocking:
+ db.$cmd.sys.unlock.findOne()
+ */
+ class FSyncCommand : public Command {
+ static const char* url() { return "http://www.mongodb.org/display/DOCS/fsync+Command"; }
+ class LockDBJob : public BackgroundJob {
+ protected:
+ virtual string name() const { return "lockdbjob"; }
+ void run() {
+ Client::initThread("fsyncjob");
+ Client& c = cc();
+ {
+ scoped_lock lk(fsyncLockMutex);
+ while (lockedForWriting){ // there is a small window for two LockDBJob's to be active. This prevents it.
+ fsyncLockCondition.wait(lk.boost());
+ }
+ lockedForWriting = true;
+ fsyncLockID.init();
+ }
+ readlock lk("");
+ MemoryMappedFile::flushAll(true);
+ log() << "db is now locked for snapshotting, no writes allowed. db.fsyncUnlock() to unlock" << endl;
+ log() << " For more info see " << FSyncCommand::url() << endl;
+ _ready = true;
+ {
+ scoped_lock lk(fsyncLockMutex);
+ while( !unlockRequested ) {
+ fsyncLockCondition.wait(lk.boost());
+ }
+ unlockRequested = false;
+ lockedForWriting = false;
+ fsyncLockCondition.notify_all();
+ }
+ c.shutdown();
+ }
+ public:
+ bool& _ready;
+ LockDBJob(bool& ready) : BackgroundJob( true /* delete self */ ), _ready(ready) {
+ _ready = false;
+ }
+ };
+ public:
+ FSyncCommand() : Command( "fsync" ) {}
+ virtual LockType locktype() const { return WRITE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) {
+ string x = cmdObj["exec"].valuestrsafe();
+ return !x.empty();
+ }*/
+ virtual void help(stringstream& h) const { h << url(); }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately
+ bool lock = cmdObj["lock"].trueValue();
+ log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl;
+
+ if( lock ) {
+ // fsync and lock variation
+
+ uassert(12034, "fsync: can't lock while an unlock is pending", !unlockRequested);
+ uassert(12032, "fsync: sync option must be true when using lock", sync);
+ /* With releaseEarly(), we must be extremely careful we don't do anything
+ where we would have assumed we were locked. profiling is one of those things.
+ Perhaps at profile time we could check if we released early -- however,
+ we need to be careful to keep that code very fast; it's a very common code path when profiling is on.
+ */
+ uassert(12033, "fsync: profiling must be off to enter locked mode", cc().database()->profile == 0);
+
+ // todo future: Perhaps we could do this in the background thread. As is now, writes may interleave between
+ // the releaseEarly below and the acquisition of the readlock in the background thread.
+ // However the real problem is that it seems complex to unlock here and then have a window for
+ // writes before the bg job -- can be done correctly but harder to reason about correctness.
+ // If this command ran within a read lock in the first place, would it work, and then that
+ // would be quite easy?
+ // Or, could we downgrade the write lock to a read lock, wait for ready, then release?
+ getDur().syncDataAndTruncateJournal();
+
+ bool ready = false;
+ LockDBJob *l = new LockDBJob(ready);
+
+ d.dbMutex.releaseEarly();
+
+ // There is a narrow window for another lock request to come in
+ // here before the LockDBJob grabs the readlock. LockDBJob will
+ // ensure that the requests are serialized and never running
+ // concurrently
+
+ l->go();
+ // don't return until background thread has acquired the read lock
+ while( !ready ) {
+ sleepmillis(10);
+ }
+ result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock");
+ result.append("seeAlso", url());
+ }
+ else {
+ // the simple fsync command case
+
+ if (sync)
+ getDur().commitNow();
+ result.append( "numFiles" , MemoryMappedFile::flushAll( sync ) );
+ }
+ return 1;
+ }
+
+ } fsyncCmd;
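+
+ /* Hedged usage sketch (shell syntax):
+ db.adminCommand( { fsync: 1 } ) // flush dirty data files now
+ db.adminCommand( { fsync: 1, lock: true } ) // flush, then block writes (e.g. for backup)
+ db.fsyncUnlock() // request unlock; see unlockFsyncAndWait() below
+ */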
+
+ // Note that this will only unlock the current lock. If another thread
+ // relocks before we return we still consider the unlocking successful.
+ // This is important because if two scripts are trying to fsync-lock, each
+ // one must be assured that between the fsync return and the call to unlock
+ // the database is fully locked
+ void unlockFsyncAndWait(){
+ scoped_lock lk(fsyncLockMutex);
+ if (lockedForWriting) { // could have handled another unlock before we grabbed the lock
+ OID curOp = fsyncLockID;
+ unlockRequested = true;
+ fsyncLockCondition.notify_all();
+ while (lockedForWriting && fsyncLockID == curOp){
+ fsyncLockCondition.wait( lk.boost() );
+ }
+ }
+ }
+}
+
diff --git a/src/mongo/db/dbcommands_generic.cpp b/src/mongo/db/dbcommands_generic.cpp
new file mode 100644
index 00000000000..cfd833aa72d
--- /dev/null
+++ b/src/mongo/db/dbcommands_generic.cpp
@@ -0,0 +1,432 @@
+/** @file dbcommands_generic.cpp commands suited for any mongo server (both mongod, mongos) */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "json.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "replutil.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "../scripting/engine.h"
+#include "stats/counters.h"
+#include "background.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "repl/multicmd.h"
+#include "server.h"
+
+namespace mongo {
+
+#if 0
+ namespace cloud {
+ SimpleMutex mtx("cloud");
+ Guarded< vector<string>, mtx > ips;
+ bool startedThread = false;
+
+ void thread() {
+ bson::bo cmd;
+ while( 1 ) {
+ list<Target> L;
+ {
+ SimpleMutex::scoped_lock lk(mtx);
+ if( ips.ref(lk).empty() )
+ continue;
+ for( unsigned i = 0; i < ips.ref(lk).size(); i++ ) {
+ L.push_back( Target(ips.ref(lk)[i]) );
+ }
+ }
+
+
+ /** repoll as machines might be down on the first lookup (only if not found previously) */
+ sleepsecs(6);
+ }
+ }
+ }
+
+ class CmdCloud : public Command {
+ public:
+ CmdCloud() : Command( "cloud" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "internal command facilitating running in certain cloud computing environments";
+ }
+ bool run(const string& dbname, BSONObj& obj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ if( !obj.hasElement("servers") ) {
+ vector<string> ips;
+ obj["servers"].Obj().Vals(ips);
+ {
+ SimpleMutex::scoped_lock lk(cloud::mtx);
+ cloud::ips.ref(lk).swap(ips);
+ if( !cloud::startedThread ) {
+ cloud::startedThread = true;
+ boost::thread thr(cloud::thread);
+ }
+ }
+ }
+ return true;
+ }
+ } cmdCloud;
+#endif
+
+ class CmdBuildInfo : public Command {
+ public:
+ CmdBuildInfo() : Command( "buildInfo", true, "buildinfo" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool requiresAuth() { return false; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "get version #, etc.\n";
+ help << "{ buildinfo:1 }";
+ }
+ bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo();
+ result << "versionArray" << versionArray;
+ result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 );
+ result.appendBool( "debug" , debug );
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
+ return true;
+ }
+ } cmdBuildInfo;
+
+ /** experimental. either remove or add support in repl sets also. in a repl set, getting this setting from the
+ repl set config could make sense.
+ */
+ unsigned replApplyBatchSize = 1;
+
+ class CmdGet : public Command {
+ public:
+ CmdGet() : Command( "getParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "get administrative option(s)\nexample:\n";
+ help << "{ getParameter:1, notablescan:1 }\n";
+ help << "supported so far:\n";
+ help << " quiet\n";
+ help << " notablescan\n";
+ help << " logLevel\n";
+ help << " syncdelay\n";
+ help << "{ getParameter:'*' } to get everything\n";
+ }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ bool all = *cmdObj.firstElement().valuestrsafe() == '*';
+
+ int before = result.len();
+
+ if( all || cmdObj.hasElement("quiet") ) {
+ result.append("quiet", cmdLine.quiet );
+ }
+ if( all || cmdObj.hasElement("notablescan") ) {
+ result.append("notablescan", cmdLine.noTableScan);
+ }
+ if( all || cmdObj.hasElement("logLevel") ) {
+ result.append("logLevel", logLevel);
+ }
+ if( all || cmdObj.hasElement("syncdelay") ) {
+ result.append("syncdelay", cmdLine.syncdelay);
+ }
+ if( all || cmdObj.hasElement("replApplyBatchSize") ) {
+ result.append("replApplyBatchSize", replApplyBatchSize);
+ }
+
+ if ( before == result.len() ) {
+ errmsg = "no option found to get";
+ return false;
+ }
+ return true;
+ }
+ } cmdGet;
+
+ // tempish
+ bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl );
+
+ class CmdSet : public Command {
+ public:
+ CmdSet() : Command( "setParameter" ) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const {
+ help << "set administrative option(s)\n";
+ help << "{ setParameter:1, <param>:<value> }\n";
+ help << "supported so far:\n";
+ help << " journalCommitInterval\n";
+ help << " logLevel\n";
+ help << " notablescan\n";
+ help << " quiet\n";
+ help << " syncdelay\n";
+ }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+ int s = 0;
+ bool found = setParmsMongodSpecific(dbname, cmdObj, errmsg, result, fromRepl);
+ if( cmdObj.hasElement("journalCommitInterval") ) {
+ if( !cmdLine.dur ) {
+ errmsg = "journaling is off";
+ return false;
+ }
+ int x = (int) cmdObj["journalCommitInterval"].Number();
+ assert( x > 1 && x < 500 );
+ cmdLine.journalCommitInterval = x;
+ log() << "setParameter journalCommitInterval=" << x << endl;
+ s++;
+ }
+ if( cmdObj.hasElement("notablescan") ) {
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.noTableScan);
+ cmdLine.noTableScan = cmdObj["notablescan"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("quiet") ) {
+ if( s == 0 )
+ result.append("was", cmdLine.quiet );
+ cmdLine.quiet = cmdObj["quiet"].Bool();
+ s++;
+ }
+ if( cmdObj.hasElement("syncdelay") ) {
+ assert( !cmdLine.isMongos() );
+ if( s == 0 )
+ result.append("was", cmdLine.syncdelay );
+ cmdLine.syncdelay = cmdObj["syncdelay"].Number();
+ s++;
+ }
+ if( cmdObj.hasElement( "logLevel" ) ) {
+ if( s == 0 )
+ result.append("was", logLevel );
+ logLevel = cmdObj["logLevel"].numberInt();
+ s++;
+ }
+ if( cmdObj.hasElement( "replApplyBatchSize" ) ) {
+ if( s == 0 )
+ result.append("was", replApplyBatchSize );
+ BSONElement e = cmdObj["replApplyBatchSize"];
+ ParameterValidator * v = ParameterValidator::get( e.fieldName() );
+ assert( v );
+ if ( ! v->isValid( e , errmsg ) )
+ return false;
+ replApplyBatchSize = e.numberInt();
+ s++;
+ }
+ if( cmdObj.hasElement( "traceExceptions" ) ) {
+ if( s == 0 ) result.append( "was", DBException::traceExceptions );
+ DBException::traceExceptions = cmdObj["traceExceptions"].Bool();
+ s++;
+ }
+
+ if( s == 0 && !found ) {
+ errmsg = "no option found to set, use help:true to see options ";
+ return false;
+ }
+
+ return true;
+ }
+ } cmdSet;
+
+ class PingCommand : public Command {
+ public:
+ PingCommand() : Command( "ping" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. responds immediately even if server is in a db lock."; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool requiresAuth() { return false; }
+ virtual bool run(const string& badns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ // IMPORTANT: Don't put anything in here that might lock db - including authentication
+ return true;
+ }
+ } pingCmd;
+
+ class FeaturesCmd : public Command {
+ public:
+ FeaturesCmd() : Command( "features", true ) {}
+ void help(stringstream& h) const { h << "return build level feature settings"; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool readOnly() { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( globalScriptEngine ) {
+ BSONObjBuilder bb( result.subobjStart( "js" ) );
+ result.append( "utf8" , globalScriptEngine->utf8Ok() );
+ bb.done();
+ }
+ if ( cmdObj["oidReset"].trueValue() ) {
+ result.append( "oidMachineOld" , OID::getMachineId() );
+ OID::regenMachineId();
+ }
+ result.append( "oidMachine" , OID::getMachineId() );
+ return true;
+ }
+
+ } featuresCmd;
+
+ class LogRotateCmd : public Command {
+ public:
+ LogRotateCmd() : Command( "logRotate" ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ rotateLogs();
+ return 1;
+ }
+
+ } logRotateCmd;
+
+ class ListCommandsCmd : public Command {
+ public:
+ virtual void help( stringstream &help ) const { help << "get a list of all db commands"; }
+ ListCommandsCmd() : Command( "listCommands", false ) {}
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return false; }
+ virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ BSONObjBuilder b( result.subobjStart( "commands" ) );
+ for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ) {
+ Command * c = i->second;
+
+ // don't show oldnames
+ if (i->first != c->name)
+ continue;
+
+ BSONObjBuilder temp( b.subobjStart( c->name ) );
+
+ {
+ stringstream help;
+ c->help( help );
+ temp.append( "help" , help.str() );
+ }
+ temp.append( "lockType" , c->locktype() );
+ temp.append( "slaveOk" , c->slaveOk() );
+ temp.append( "adminOnly" , c->adminOnly() );
+ //optionally indicates that the command can be forced to run on a slave/secondary
+ if ( c->slaveOverrideOk() ) temp.append( "slaveOverrideOk" , c->slaveOverrideOk() );
+ temp.done();
+ }
+ b.done();
+
+ return 1;
+ }
+
+ } listCommandsCmd;
+
+ bool CmdShutdown::shutdownHelper() {
+ Client * c = currentClient.get();
+ if ( c ) {
+ c->shutdown();
+ }
+
+ log() << "terminating, shutdown command received" << endl;
+
+ dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns
+ assert(0);
+ return true;
+ }
+
+ /* for testing purposes only */
+ class CmdForceError : public Command {
+ public:
+ virtual void help( stringstream& help ) const {
+ help << "for testing purposes only. forces a user assertion exception";
+ }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdForceError() : Command("forceerror") {}
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ uassert( 10038 , "forced error", false);
+ return true;
+ }
+ } cmdForceError;
+
+ class AvailableQueryOptions : public Command {
+ public:
+ AvailableQueryOptions() : Command( "availableQueryOptions" , false , "availablequeryoptions" ) {}
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ result << "options" << QueryOption_AllSupported;
+ return true;
+ }
+ } availableQueryOptionsCmd;
+
+ class GetLogCmd : public Command {
+ public:
+ GetLogCmd() : Command( "getLog" ){}
+
+ virtual bool slaveOk() const { return true; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool adminOnly() const { return true; }
+
+ virtual void help( stringstream& help ) const {
+ help << "{ getLog : '*' } OR { getLog : 'global' }";
+ }
+
+ virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
+ string p = cmdObj.firstElement().String();
+ if ( p == "*" ) {
+ vector<string> names;
+ RamLog::getNames( names );
+
+ BSONArrayBuilder arr;
+ for ( unsigned i=0; i<names.size(); i++ ) {
+ arr.append( names[i] );
+ }
+
+ result.appendArray( "names" , arr.arr() );
+ }
+ else {
+ RamLog* rl = RamLog::get( p );
+ if ( ! rl ) {
+ errmsg = str::stream() << "no RamLog named: " << p;
+ return false;
+ }
+
+ vector<const char*> lines;
+ rl->get( lines );
+
+ BSONArrayBuilder arr( result.subarrayStart( "log" ) );
+ for ( unsigned i=0; i<lines.size(); i++ )
+ arr.append( lines[i] );
+ arr.done();
+ }
+ return true;
+ }
+
+ } getLogCmd;
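+
+ /* Hedged usage sketch:
+ db.adminCommand( { getLog: "*" } ) // -> { names: [ "global", ... ], ok: 1 }
+ db.adminCommand( { getLog: "global" } ) // -> { log: [ "<line>", ... ], ok: 1 }
+ (the "global" ram log name is illustrative of typical registered names) */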
+
+}
diff --git a/src/mongo/db/dbeval.cpp b/src/mongo/db/dbeval.cpp
new file mode 100644
index 00000000000..9e77d8c8097
--- /dev/null
+++ b/src/mongo/db/dbeval.cpp
@@ -0,0 +1,136 @@
+/* dbeval.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "json.h"
+#include "repl.h"
+#include "commands.h"
+#include "cmdline.h"
+
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ const int edebug=0;
+
+ bool dbEval(const string& dbName, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) {
+ BSONElement e = cmd.firstElement();
+ uassert( 10046 , "eval needs Code" , e.type() == Code || e.type() == CodeWScope || e.type() == String );
+
+ const char *code = 0;
+ switch ( e.type() ) {
+ case String:
+ case Code:
+ code = e.valuestr();
+ break;
+ case CodeWScope:
+ code = e.codeWScopeCode();
+ break;
+ default:
+ assert(0);
+ }
+ assert( code );
+
+ if ( ! globalScriptEngine ) {
+ errmsg = "db side execution is disabled";
+ return false;
+ }
+
+ auto_ptr<Scope> s = globalScriptEngine->getPooledScope( dbName );
+ ScriptingFunction f = s->createFunction(code);
+ if ( f == 0 ) {
+ errmsg = (string)"compile failed: " + s->getError();
+ return false;
+ }
+
+ if ( e.type() == CodeWScope )
+ s->init( e.codeWScopeScopeData() );
+ s->localConnect( dbName.c_str() );
+
+ BSONObj args;
+ {
+ BSONElement argsElement = cmd.getField("args");
+ if ( argsElement.type() == Array ) {
+ args = argsElement.embeddedObject();
+ if ( edebug ) {
+ out() << "args:" << args.toString() << endl;
+ out() << "code:\n" << code << endl;
+ }
+ }
+ }
+
+ int res;
+ {
+ Timer t;
+ res = s->invoke(f, &args, 0, cmdLine.quota ? 10 * 60 * 1000 : 0 );
+ int m = t.millis();
+ if ( m > cmdLine.slowMS ) {
+ out() << "dbeval slow, time: " << dec << m << "ms " << dbName << endl;
+ if ( m >= 1000 ) log() << code << endl;
+ else OCCASIONALLY log() << code << endl;
+ }
+ }
+ if ( res ) {
+ result.append("errno", (double) res);
+ errmsg = "invoke failed: ";
+ errmsg += s->getError();
+ return false;
+ }
+
+ s->append( result , "retval" , "return" );
+
+ return true;
+ }
+
+ class CmdEval : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return false;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "Evaluate javascript at the server.\n" "http://www.mongodb.org/display/DOCS/Server-side+Code+Execution";
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdEval() : Command("eval", false, "$eval") { }
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) );
+
+ if ( cmdObj["nolock"].trueValue() ) {
+ return dbEval(dbname, cmdObj, result, errmsg);
+ }
+
+ // write security will be enforced in DBDirectClient
+ mongolock lk( ai->isAuthorized( dbname.c_str() ) );
+ Client::Context ctx( dbname );
+
+ return dbEval(dbname, cmdObj, result, errmsg);
+ }
+ } cmdeval;
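+
+ /* Hedged usage sketch (function and args illustrative):
+ db.runCommand( { $eval: function(x, y) { return x + y; }, args: [ 1, 2 ] } )
+ -> { retval: 3, ok: 1 }
+ nolock:true runs the script without taking the db lock. */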
+
+} // namespace mongo
diff --git a/src/mongo/db/dbhelpers.cpp b/src/mongo/db/dbhelpers.cpp
new file mode 100644
index 00000000000..39540c9ce89
--- /dev/null
+++ b/src/mongo/db/dbhelpers.cpp
@@ -0,0 +1,353 @@
+// dbhelpers.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "queryoptimizer.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "oplog.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+
+namespace mongo {
+
+ void Helpers::ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name) {
+ NamespaceDetails *d = nsdetails(ns);
+ if( d == 0 )
+ return;
+
+ {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ if( i.next().keyPattern().woCompare(keyPattern) == 0 )
+ return;
+ }
+ }
+
+ if( d->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ problem() << "Helper::ensureIndex fails, MaxIndexes exceeded " << ns << '\n';
+ return;
+ }
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+
+ BSONObjBuilder b;
+ b.append("name", name);
+ b.append("ns", ns);
+ b.append("key", keyPattern);
+ b.appendBool("unique", unique);
+ BSONObj o = b.done();
+
+ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize());
+ }
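+
+ /* Illustrative call (names assumed, not from this file):
+ Helpers::ensureIndex( "test.foo", BSON( "ts" << 1 ), false, "ts_1" );
+ a no-op if test.foo does not exist or an index with that key pattern is
+ already present; otherwise inserts the spec into <db>.system.indexes. */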
+
+ /* fetch a single object from collection ns that matches query
+ set your db SavedContext first
+ */
+ bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) {
+ DiskLoc loc = findOne( ns, query, requireIndex );
+ if ( loc.isNull() )
+ return false;
+ result = loc.obj();
+ return true;
+ }
+
+ /* fetch a single object from collection ns that matches query
+ set your db SavedContext first
+ */
+ DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) {
+ shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), requireIndex );
+ while( c->ok() ) {
+ if ( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
+ return c->currLoc();
+ }
+ c->advance();
+ }
+ return DiskLoc();
+ }
+
+ bool Helpers::findById(Client& c, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound , bool * indexFound ) {
+ d.dbMutex.assertAtLeastReadLocked();
+ Database *database = c.database();
+ assert( database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( ! d )
+ return false;
+ if ( nsFound )
+ *nsFound = 1;
+
+ int idxNo = d->findIdIndex();
+ if ( idxNo < 0 )
+ return false;
+ if ( indexFound )
+ *indexFound = 1;
+
+ IndexDetails& i = d->idx( idxNo );
+
+ BSONObj key = i.getKeyFromQuery( query );
+
+ DiskLoc loc = i.idxInterface().findSingle(i , i.head , key);
+ if ( loc.isNull() )
+ return false;
+ result = loc.obj();
+ return true;
+ }
+
+ DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) {
+ assert(d);
+ int idxNo = d->findIdIndex();
+ uassert(13430, "no _id index", idxNo>=0);
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( idquery );
+ return i.idxInterface().findSingle(i , i.head , key);
+ }
+
+ bool Helpers::isEmpty(const char *ns, bool doAuth) {
+ Client::Context context(ns, dbpath, doAuth);
+ shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
+ return !c->ok();
+ }
+
+ /* Get the first object from a collection. Generally only useful if the collection
+ only ever has a single object -- which is a "singleton collection".
+
+ Returns: true if object exists.
+ */
+ bool Helpers::getSingleton(const char *ns, BSONObj& result) {
+ Client::Context context(ns);
+
+ shared_ptr<Cursor> c = DataFileMgr::findAll(ns);
+ if ( !c->ok() ) {
+ context.getClient()->curop()->done();
+ return false;
+ }
+
+ result = c->current();
+ context.getClient()->curop()->done();
+ return true;
+ }
+
+ bool Helpers::getLast(const char *ns, BSONObj& result) {
+ Client::Context ctx(ns);
+ shared_ptr<Cursor> c = findTableScan(ns, reverseNaturalObj);
+ if( !c->ok() )
+ return false;
+ result = c->current();
+ return true;
+ }
+
+ void Helpers::upsert( const string& ns , const BSONObj& o ) {
+ BSONElement e = o["_id"];
+ assert( e.type() );
+ BSONObj id = e.wrap();
+
+ OpDebug debug;
+ Client::Context context(ns);
+ updateObjects(ns.c_str(), o, /*pattern=*/id, /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
+ }
+
+ void Helpers::putSingleton(const char *ns, BSONObj obj) {
+ OpDebug debug;
+ Client::Context context(ns);
+ updateObjects(ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug );
+ context.getClient()->curop()->done();
+ }
+
+ void Helpers::putSingletonGod(const char *ns, BSONObj obj, bool logTheOp) {
+ OpDebug debug;
+ Client::Context context(ns);
+ _updateObjects(/*god=*/true, ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , logTheOp , debug );
+ context.getClient()->curop()->done();
+ }
+
+ BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ) {
+ BSONObjBuilder me;
+ BSONObjBuilder k;
+
+ BSONObjIterator i( o );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ k.append( e.fieldName() , 1 );
+ me.appendAs( e , "" );
+ }
+ key = k.obj();
+ return me.obj();
+ }
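+
+ /* Illustrative: toKeyFormat( BSON( "a" << 2 << "b" << "x" ), key ) sets key to
+ { a: 1, b: 1 } and returns { "": 2, "": "x" } -- the key-pattern / key-value
+ pair that removeRange() below feeds to BtreeCursor. */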
+
+ long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ) {
+ BSONObj keya , keyb;
+ BSONObj minClean = toKeyFormat( min , keya );
+ BSONObj maxClean = toKeyFormat( max , keyb );
+ assert( keya == keyb );
+
+ Client::Context ctx(ns);
+ NamespaceDetails* nsd = nsdetails( ns.c_str() );
+ if ( ! nsd )
+ return 0;
+
+ int ii = nsd->findIndexByKeyPattern( keya );
+ assert( ii >= 0 );
+
+ long long num = 0;
+
+ IndexDetails& i = nsd->idx( ii );
+
+ shared_ptr<Cursor> c( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+ cc->setDoingDeletes( true );
+
+ while ( c->ok() ) {
+
+ if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) {
+ // cursor got finished by someone else, so we're done
+ cc.release(); // if the collection/db is dropped, cc may be deleted
+ break;
+ }
+
+ if ( ! c->ok() )
+ break;
+
+ DiskLoc rloc = c->currLoc();
+
+ if ( callback )
+ callback->goingToDelete( c->current() );
+
+ c->advance();
+ c->noteLocation();
+
+ logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() );
+ theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
+ num++;
+
+ c->checkLocation();
+
+ getDur().commitIfNeeded();
+
+
+ }
+
+ return num;
+ }
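+
+ /* Illustrative call (names assumed): with an { a: 1 } index on test.foo,
+ Helpers::removeRange( "test.foo", BSON( "a" << 1 ), BSON( "a" << 10 ) )
+ deletes every document whose key falls in [ {a:1}, {a:10} ) -- max inclusive
+ only if requested -- logging each delete to the oplog and returning the count. */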
+
+ void Helpers::emptyCollection(const char *ns) {
+ Client::Context context(ns);
+ deleteObjects(ns, BSONObj(), false);
+ }
+
+ DbSet::~DbSet() {
+ if ( name_.empty() )
+ return;
+ try {
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ string errmsg;
+ BSONObjBuilder result;
+ dropCollection( name_, errmsg, result );
+ }
+ }
+ catch ( ... ) {
+ problem() << "exception cleaning up DbSet" << endl;
+ }
+ }
+
+ void DbSet::reset( const string &name, const BSONObj &key ) {
+ if ( !name.empty() )
+ name_ = name;
+ if ( !key.isEmpty() )
+ key_ = key.getOwned();
+ Client::Context c( name_.c_str() );
+ if ( nsdetails( name_.c_str() ) ) {
+ Helpers::emptyCollection( name_.c_str() );
+ }
+ else {
+ string err;
+ massert( 10303 , err, userCreateNS( name_.c_str(), fromjson( "{autoIndexId:false}" ), err, false ) );
+ }
+ Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" );
+ }
+
+ bool DbSet::get( const BSONObj &obj ) const {
+ Client::Context c( name_.c_str() );
+ BSONObj temp;
+ return Helpers::findOne( name_.c_str(), obj, temp, true );
+ }
+
+ void DbSet::set( const BSONObj &obj, bool val ) {
+ Client::Context c( name_.c_str() );
+ if ( val ) {
+ try {
+ BSONObj k = obj;
+ theDataFileMgr.insertWithObjMod( name_.c_str(), k, false );
+ }
+ catch ( DBException& ) {
+ // dup key - already in set
+ }
+ }
+ else {
+ deleteObjects( name_.c_str(), obj, true, false, false );
+ }
+ }
+
+ RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0) {
+ static int NUM = 0;
+
+ _root = dbpath;
+ if ( a.size() )
+ _root /= a;
+ if ( b.size() )
+ _root /= b;
+ assert( a.size() || b.size() );
+
+ _file = _root;
+
+ stringstream ss;
+ ss << why << "." << terseCurrentTime(false) << "." << NUM++ << ".bson";
+ _file /= ss.str();
+
+ }
+
+ RemoveSaver::~RemoveSaver() {
+ if ( _out ) {
+ _out->close();
+ delete _out;
+ _out = 0;
+ }
+ }
+
+ void RemoveSaver::goingToDelete( const BSONObj& o ) {
+ if ( ! _out ) {
+ create_directories( _root );
+ _out = new ofstream();
+ _out->open( _file.string().c_str() , ios_base::out | ios_base::binary );
+ if ( ! _out->good() ) {
+ log( LL_WARNING ) << "couldn't create file: " << _file.string() << " for remove saving" << endl;
+ delete _out;
+ _out = 0;
+ return;
+ }
+
+ }
+ _out->write( o.objdata() , o.objsize() );
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/db/dbhelpers.h b/src/mongo/db/dbhelpers.h
new file mode 100644
index 00000000000..99d401fa1f8
--- /dev/null
+++ b/src/mongo/db/dbhelpers.h
@@ -0,0 +1,159 @@
+/* @file dbhelpers.h
+
+ db helpers are helper functions and classes that let us easily manipulate the local
+ database instance in-proc.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "client.h"
+#include "db.h"
+
+namespace mongo {
+
+ const BSONObj reverseNaturalObj = BSON( "$natural" << -1 );
+
+ class Cursor;
+ class CoveredIndexMatcher;
+
+ /**
+ all helpers assume locking is handled above them
+ */
+ struct Helpers {
+
+ /* ensure the specified index exists.
+
+ @param keyPattern key pattern, e.g., { ts : 1 }
+ @param name index name, e.g., "name_1"
+
+ This method can be a little (not much) cpu-slow, so you may wish to use
+ OCCASIONALLY ensureIndex(...);
+
+ Note: use ensureHaveIdIndex() for the _id index: it is faster.
+ Note: does nothing if collection does not yet exist.
+ */
+ static void ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name);
+
+ /* fetch a single object from collection ns that matches query.
+ set your db SavedContext first.
+
+ @param query - the query to perform. note this is the low level portion of query so "orderby : ..."
+ won't work.
+
+ @param requireIndex if true, assert if no index for the query. a way to guard against
+ writing a slow query.
+
+ @return true if object found
+ */
+ static bool findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex = false);
+ static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex);
+
+ /**
+         * @param nsFound if passed in, set to true if the ns was found
+         * @param indexFound if passed in, set to true if the _id index was found
+ * @return true if object found
+ */
+ static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result ,
+ bool * nsFound = 0 , bool * indexFound = 0 );
+
+ /* uasserts if no _id index.
+ @return null loc if not found */
+ static DiskLoc findById(NamespaceDetails *d, BSONObj query);
+
+ /** Get/put the first (or last) object from a collection. Generally only useful if the collection
+ only ever has a single object -- which is a "singleton collection".
+
+ You do not need to set the database (Context) before calling.
+
+ @return true if object exists.
+ */
+ static bool getSingleton(const char *ns, BSONObj& result);
+ static void putSingleton(const char *ns, BSONObj obj);
+ static void putSingletonGod(const char *ns, BSONObj obj, bool logTheOp);
+ static bool getFirst(const char *ns, BSONObj& result) { return getSingleton(ns, result); }
+        static bool getLast(const char *ns, BSONObj& result); // get last object in the collection; e.g. {$natural : -1}
+
+ /**
+ * you have to lock
+ * you do not have to have Context set
+ * o has to have an _id field or will assert
+ */
+ static void upsert( const string& ns , const BSONObj& o );
+
+ /** You do not need to set the database before calling.
+ @return true if collection is empty.
+ */
+ static bool isEmpty(const char *ns, bool doAuth=true);
+
+ // TODO: this should be somewhere else probably
+ static BSONObj toKeyFormat( const BSONObj& o , BSONObj& key );
+
+ class RemoveCallback {
+ public:
+ virtual ~RemoveCallback() {}
+ virtual void goingToDelete( const BSONObj& o ) = 0;
+ };
+ /* removeRange: operation is oplog'd */
+ static long long removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield = false , bool maxInclusive = false , RemoveCallback * callback = 0 );
+
+ /* Remove all objects from a collection.
+ You do not need to set the database before calling.
+ */
+ static void emptyCollection(const char *ns);
+
+ };
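+
+    /* Illustrative use of the helpers above -- a hypothetical caller sketch, not
+       code from this patch. Assumes the caller already holds the appropriate lock
+       and that the namespaces shown exist:
+
+           Client::Context ctx( "test.foo" );
+           BSONObj result;
+           if ( Helpers::findOne( "test.foo", BSON( "x" << 1 ), result, false ) ) {
+               // use result
+           }
+           Helpers::putSingleton( "local.mysettings", BSON( "_id" << 1 << "v" << 2 ) );
+    */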
+
+ class Database;
+
+ // manage a set using collection backed storage
+ class DbSet {
+ public:
+ DbSet( const string &name = "", const BSONObj &key = BSONObj() ) :
+ name_( name ),
+ key_( key.getOwned() ) {
+ }
+ ~DbSet();
+ void reset( const string &name = "", const BSONObj &key = BSONObj() );
+ bool get( const BSONObj &obj ) const;
+ void set( const BSONObj &obj, bool val );
+ private:
+ string name_;
+ BSONObj key_;
+ };
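+
+    /* Illustrative DbSet usage (hypothetical caller; as with the helpers,
+       locking is assumed to be handled above):
+
+           DbSet seen;
+           seen.reset( "local.temp.seen", BSON( "h" << 1 ) );   // (re)create backing collection
+           seen.set( BSON( "h" << 42 ), true );                 // add to set
+           bool present = seen.get( BSON( "h" << 42 ) );        // membership test
+    */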
+
+
+ /**
+     * used for saving deleted bson objects to a flat file
+ */
+ class RemoveSaver : public Helpers::RemoveCallback , boost::noncopyable {
+ public:
+ RemoveSaver( const string& type , const string& ns , const string& why);
+ ~RemoveSaver();
+
+ void goingToDelete( const BSONObj& o );
+
+ private:
+ path _root;
+ path _file;
+ ofstream* _out;
+
+ };
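+
+    /* Illustrative pairing of RemoveSaver with Helpers::removeRange (hypothetical
+       caller; the directory and "why" strings are made up for the example):
+
+           RemoveSaver saver( "moveChunk", ns, "post-migration" );
+           long long n = Helpers::removeRange( ns, min, max, true, false, &saver );
+    */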
+
+
+} // namespace mongo
diff --git a/src/mongo/db/dbmessage.cpp b/src/mongo/db/dbmessage.cpp
new file mode 100644
index 00000000000..c86b5a05240
--- /dev/null
+++ b/src/mongo/db/dbmessage.cpp
@@ -0,0 +1,108 @@
+// dbmessage.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dbmessage.h"
+#include "../client/dbclient.h"
+
+namespace mongo {
+
+ string Message::toString() const {
+ stringstream ss;
+ ss << "op: " << opToString( operation() ) << " len: " << size();
+ if ( operation() >= 2000 && operation() < 2100 ) {
+ DbMessage d(*this);
+ ss << " ns: " << d.getns();
+ switch ( operation() ) {
+ case dbUpdate: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ BSONObj o = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q << " update: " << o;
+ break;
+ }
+ case dbInsert:
+ ss << d.nextJsObj();
+ break;
+ case dbDelete: {
+ int flags = d.pullInt();
+ BSONObj q = d.nextJsObj();
+ ss << " flags: " << flags << " query: " << q;
+ break;
+ }
+ default:
+ ss << " CANNOT HANDLE YET";
+ }
+
+ }
+ return ss.str();
+ }
+
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom,
+ long long cursorId
+ ) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ b.appendBuf(data, size);
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = cursorId;
+ qr->startingFrom = startingFrom;
+ qr->nReturned = nReturned;
+ b.decouple();
+ Message resp(qr, true);
+ p->reply(requestMsg, resp, requestMsg.header()->id);
+ }
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj) {
+ replyToQuery(queryResultFlags,
+ p, requestMsg,
+ (void *) responseObj.objdata(), responseObj.objsize(), 1);
+ }
+
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.appendBuf((void*) obj.objdata(), obj.objsize());
+ QueryResult* msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = queryResultFlags;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ Message *resp = new Message();
+ resp->setData(msgdata, true); // transport will free
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/dbmessage.h b/src/mongo/db/dbmessage.h
new file mode 100644
index 00000000000..a789bff849c
--- /dev/null
+++ b/src/mongo/db/dbmessage.h
@@ -0,0 +1,282 @@
+// dbmessage.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "diskloc.h"
+#include "jsobj.h"
+#include "namespace-inl.h"
+#include "../util/net/message.h"
+#include "../client/constants.h"
+#include "instance.h"
+
+namespace mongo {
+
+ /* db response format
+
+ Query or GetMore: // see struct QueryResult
+ int resultFlags;
+ int64 cursorID;
+ int startingFrom;
+ int nReturned;
+ list of marshalled JSObjects;
+ */
+
+/* db request message format
+
+   unsigned opid;         // arbitrary; will be echoed back
+ byte operation;
+ int options;
+
+ then for:
+
+ dbInsert:
+ string collection;
+ a series of JSObjects
+ dbDelete:
+ string collection;
+ int flags=0; // 1=DeleteSingle
+ JSObject query;
+ dbUpdate:
+ string collection;
+ int flags; // 1=upsert
+ JSObject query;
+ JSObject objectToUpdate;
+ objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod.
+ dbQuery:
+ string collection;
+ int nToSkip;
+ int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit)
+ // greater than zero is simply a hint on how many objects to send back per "cursor batch".
+ // a negative number indicates a hard limit.
+ JSObject query;
+ [JSObject fieldsToReturn]
+ dbGetMore:
+ string collection; // redundant, might use for security.
+ int nToReturn;
+ int64 cursorID;
+ dbKillCursors=2007:
+ int n;
+ int64 cursorIDs[n];
+
+ Note that on Update, there is only one object, which is different
+ from insert where you can pass a list of objects to insert in the db.
+   Note that the update field layout is very similar to that of Query.
+*/
+
+
+#pragma pack(1)
+ struct QueryResult : public MsgData {
+ long long cursorId;
+ int startingFrom;
+ int nReturned;
+ const char *data() {
+ return (char *) (((int *)&nReturned)+1);
+ }
+ int resultFlags() {
+ return dataAsInt();
+ }
+ int& _resultFlags() {
+ return dataAsInt();
+ }
+ void setResultFlagsToOk() {
+ _resultFlags() = ResultFlag_AwaitCapable;
+ }
+ void initializeResultFlags() {
+ _resultFlags() = 0;
+ }
+ };
+
+#pragma pack()
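+
+    /* Illustrative walk of a received reply (hypothetical client-side sketch;
+       assumes reply is a Message holding an opReply):
+
+           QueryResult *qr = (QueryResult *) reply.singleData();
+           const char *p = qr->data();
+           for ( int i = 0; i < qr->nReturned; i++ ) {
+               BSONObj o( p );
+               p += o.objsize();
+           }
+    */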
+
+ /* For the database/server protocol, these objects and functions encapsulate
+ the various messages transmitted over the connection.
+
+ See http://www.mongodb.org/display/DOCS/Mongo+Wire+Protocol
+ */
+ class DbMessage {
+ public:
+ DbMessage(const Message& _m) : m(_m) , mark(0) {
+ // for received messages, Message has only one buffer
+ theEnd = _m.singleData()->_data + _m.header()->dataLen();
+ char *r = _m.singleData()->_data;
+ reserved = (int *) r;
+ data = r + 4;
+ nextjsobj = data;
+ }
+
+ /** the 32 bit field before the ns
+         * track all bit usage here as it's cross-op
+ * 0: InsertOption_ContinueOnError
+ * 1: fromWriteback
+ */
+ int& reservedField() { return *reserved; }
+
+ const char * getns() const {
+ return data;
+ }
+ void getns(Namespace& ns) const {
+ ns = data;
+ }
+
+ const char * afterNS() const {
+ return data + strlen( data ) + 1;
+ }
+
+ int getInt( int num ) const {
+ const int * foo = (const int*)afterNS();
+ return foo[num];
+ }
+
+ int getQueryNToReturn() const {
+ return getInt( 1 );
+ }
+
+ /**
+ * get an int64 at specified offsetBytes after ns
+ */
+ long long getInt64( int offsetBytes ) const {
+ const char * x = afterNS();
+ x += offsetBytes;
+ const long long * ll = (const long long*)x;
+ return ll[0];
+ }
+
+ void resetPull() { nextjsobj = data; }
+        int pullInt() const { return const_cast<DbMessage*>(this)->pullInt(); }
+ int& pullInt() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ int& i = *((int *)nextjsobj);
+ nextjsobj += 4;
+ return i;
+ }
+        long long pullInt64() const {
+            return const_cast<DbMessage*>(this)->pullInt64();
+        }
+ long long &pullInt64() {
+ if ( nextjsobj == data )
+ nextjsobj += strlen(data) + 1; // skip namespace
+ long long &i = *((long long *)nextjsobj);
+ nextjsobj += 8;
+ return i;
+ }
+
+ OID* getOID() const {
+ return (OID *) (data + strlen(data) + 1); // skip namespace
+ }
+
+ void getQueryStuff(const char *&query, int& ntoreturn) {
+ int *i = (int *) (data + strlen(data) + 1);
+ ntoreturn = *i;
+ i++;
+ query = (const char *) i;
+ }
+
+ /* for insert and update msgs */
+ bool moreJSObjs() const {
+ return nextjsobj != 0;
+ }
+ BSONObj nextJsObj() {
+ if ( nextjsobj == data ) {
+ nextjsobj += strlen(data) + 1; // skip namespace
+ massert( 13066 , "Message contains no documents", theEnd > nextjsobj );
+ }
+ massert( 10304 , "Client Error: Remaining data too small for BSON object", theEnd - nextjsobj > 3 );
+ BSONObj js(nextjsobj);
+ massert( 10305 , "Client Error: Invalid object size", js.objsize() > 3 );
+ massert( 10306 , "Client Error: Next object larger than space left in message",
+ js.objsize() < ( theEnd - data ) );
+ if ( cmdLine.objcheck && !js.valid() ) {
+ massert( 10307 , "Client Error: bad object in message", false);
+ }
+ nextjsobj += js.objsize();
+ if ( nextjsobj >= theEnd )
+ nextjsobj = 0;
+ return js;
+ }
+
+ const Message& msg() const { return m; }
+
+ void markSet() {
+ mark = nextjsobj;
+ }
+
+ void markReset() {
+ assert( mark );
+ nextjsobj = mark;
+ }
+
+ private:
+ const Message& m;
+ int* reserved;
+ const char *data;
+ const char *nextjsobj;
+ const char *theEnd;
+
+ const char * mark;
+
+ public:
+ enum ReservedOptions {
+ Reserved_InsertOption_ContinueOnError = 1 << 0 ,
+ Reserved_FromWriteback = 1 << 1
+ };
+ };
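+
+    /* Illustrative parse of a dbInsert message using DbMessage (hypothetical
+       receiving code; assumes m is a received Message):
+
+           DbMessage d( m );
+           const char *ns = d.getns();
+           while ( d.moreJSObjs() ) {
+               BSONObj doc = d.nextJsObj();   // one document per iteration
+               // insert doc into ns ...
+           }
+    */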
+
+
+ /* a request to run a query, received from the database */
+ class QueryMessage {
+ public:
+ const char *ns;
+ int ntoskip;
+ int ntoreturn;
+ int queryOptions;
+ BSONObj query;
+ BSONObj fields;
+
+ /* parses the message into the above fields */
+ QueryMessage(DbMessage& d) {
+ ns = d.getns();
+ ntoskip = d.pullInt();
+ ntoreturn = d.pullInt();
+ query = d.nextJsObj();
+ if ( d.moreJSObjs() ) {
+ fields = d.nextJsObj();
+ }
+ queryOptions = d.msg().header()->dataAsInt();
+ }
+ };
+
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ void *data, int size,
+ int nReturned, int startingFrom = 0,
+ long long cursorId = 0
+ );
+
+
+ /* object reply helper. */
+ void replyToQuery(int queryResultFlags,
+ AbstractMessagingPort* p, Message& requestMsg,
+ BSONObj& responseObj);
+
+ /* helper to do a reply using a DbResponse object */
+ void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj);
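+
+    /* Illustrative reply from a request handler (hypothetical; dbresponse comes
+       from the caller that dispatched the message m):
+
+           BSONObjBuilder b;
+           b.append( "ok" , 1 );
+           replyToQuery( 0, m, dbresponse, b.obj() );
+    */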
+
+
+} // namespace mongo
diff --git a/src/mongo/db/dbwebserver.cpp b/src/mongo/db/dbwebserver.cpp
new file mode 100644
index 00000000000..eb19ba3be6c
--- /dev/null
+++ b/src/mongo/db/dbwebserver.cpp
@@ -0,0 +1,539 @@
+/* dbwebserver.cpp
+
+ This is the administrative web page displayed on port 28017.
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/miniwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/md5.hpp"
+#include "db.h"
+#include "instance.h"
+#include "security.h"
+#include "stats/snapshots.h"
+#include "background.h"
+#include "commands.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "pcrecpp.h"
+#include "../util/admin_access.h"
+#include "dbwebserver.h"
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ using namespace mongoutils::html;
+ using namespace bson;
+
+ time_t started = time(0);
+
+ struct Timing {
+ Timing() {
+ start = timeLocked = 0;
+ }
+ unsigned long long start, timeLocked;
+ };
+
+ bool execCommand( Command * c ,
+ Client& client , int queryOptions ,
+ const char *ns, BSONObj& cmdObj ,
+ BSONObjBuilder& result,
+ bool fromRepl );
+
+ class DbWebServer : public MiniWebServer {
+ public:
+ DbWebServer(const string& ip, int port, const AdminAccess* webUsers)
+ : MiniWebServer("admin web console", ip, port), _webUsers(webUsers) {
+ WebStatusPlugin::initAll();
+ }
+
+ private:
+ const AdminAccess* _webUsers; // not owned here
+
+ void doUnlockedStuff(stringstream& ss) {
+ /* this is in the header already ss << "port: " << port << '\n'; */
+ ss << "<pre>";
+ ss << mongodVersion() << '\n';
+ ss << "git hash: " << gitVersion() << '\n';
+ ss << "sys info: " << sysInfo() << '\n';
+ ss << "uptime: " << time(0)-started << " seconds\n";
+ ss << "</pre>";
+ }
+
+ bool allowed( const char * rq , vector<string>& headers, const SockAddr &from ) {
+ if ( from.isLocalHost() || !_webUsers->haveAdminUsers() ) {
+ cmdAuthenticate.authenticate( "admin", "RestUser", false );
+ return true;
+ }
+
+ string auth = getHeader( rq , "Authorization" );
+
+ if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ) {
+ auth = auth.substr( 7 ) + ", ";
+
+ map<string,string> parms;
+ pcrecpp::StringPiece input( auth );
+
+ string name, val;
+ pcrecpp::RE re("(\\w+)=\"?(.*?)\"?, ");
+ while ( re.Consume( &input, &name, &val) ) {
+ parms[name] = val;
+ }
+
+ BSONObj user = _webUsers->getAdminUser( parms["username"] );
+ if ( ! user.isEmpty() ) {
+ string ha1 = user["pwd"].str();
+ string ha2 = md5simpledigest( (string)"GET" + ":" + parms["uri"] );
+
+ stringstream r;
+ r << ha1 << ':' << parms["nonce"];
+ if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ) {
+ r << ':';
+ r << parms["nc"];
+ r << ':';
+ r << parms["cnonce"];
+ r << ':';
+ r << parms["qop"];
+ }
+ r << ':';
+ r << ha2;
+ string r1 = md5simpledigest( r.str() );
+
+ if ( r1 == parms["response"] ) {
+ cmdAuthenticate.authenticate( "admin", user["user"].str(), user[ "readOnly" ].isBoolean() && user[ "readOnly" ].boolean() );
+ return true;
+ }
+ }
+ }
+
+ stringstream authHeader;
+ authHeader
+ << "WWW-Authenticate: "
+ << "Digest realm=\"mongo\", "
+ << "nonce=\"abc\", "
+ << "algorithm=MD5, qop=\"auth\" "
+ ;
+
+ headers.push_back( authHeader.str() );
+            return false;
+ }
+
+ virtual void doRequest(
+ const char *rq, // the full request
+ string url,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ ) {
+ if ( url.size() > 1 ) {
+
+ if ( ! allowed( rq , headers, from ) ) {
+ responseCode = 401;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "not allowed\n";
+ return;
+ }
+
+ {
+ BSONObj params;
+ const size_t pos = url.find( "?" );
+ if ( pos != string::npos ) {
+ MiniWebServer::parseParams( params , url.substr( pos + 1 ) );
+ url = url.substr(0, pos);
+ }
+
+ DbWebHandler * handler = DbWebHandler::findHandler( url );
+ if ( handler ) {
+ if ( handler->requiresREST( url ) && ! cmdLine.rest ) {
+ _rejectREST( responseMsg , responseCode , headers );
+ }
+ else {
+ string callback = params.getStringField("jsonp");
+ uassert(13453, "server not started with --jsonp", callback.empty() || cmdLine.jsonp);
+
+ handler->handle( rq , url , params , responseMsg , responseCode , headers , from );
+
+ if (responseCode == 200 && !callback.empty()) {
+ responseMsg = callback + '(' + responseMsg + ')';
+ }
+ }
+ return;
+ }
+ }
+
+
+ if ( ! cmdLine.rest ) {
+ _rejectREST( responseMsg , responseCode , headers );
+ return;
+ }
+
+ responseCode = 404;
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ responseMsg = "<html><body>unknown url</body></html>\n";
+ return;
+ }
+
+ // generate home page
+
+ if ( ! allowed( rq , headers, from ) ) {
+ responseCode = 401;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "not allowed\n";
+ return;
+ }
+
+ responseCode = 200;
+ stringstream ss;
+ string dbname;
+ {
+ stringstream z;
+ z << cmdLine.binaryName << ' ' << prettyHostName();
+ dbname = z.str();
+ }
+ ss << start(dbname) << h2(dbname);
+ ss << "<p><a href=\"/_commands\">List all commands</a> | \n";
+ ss << "<a href=\"/_replSet\">Replica set status</a></p>\n";
+
+ //ss << "<a href=\"/_status\">_status</a>";
+ {
+ const map<string, Command*> *m = Command::webCommands();
+ if( m ) {
+ ss <<
+ a("",
+ "These read-only context-less commands can be executed from the web interface. "
+ "Results are json format, unless ?text=1 is appended in which case the result is output as text "
+ "for easier human viewing",
+ "Commands")
+ << ": ";
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ ) {
+ stringstream h;
+ i->second->help(h);
+ string help = h.str();
+ ss << "<a href=\"/" << i->first << "?text=1\"";
+ if( help != "no help defined" )
+ ss << " title=\"" << help << '"';
+ ss << ">" << i->first << "</a> ";
+ }
+ ss << '\n';
+ }
+ }
+ ss << '\n';
+ /*
+ ss << "HTTP <a "
+ "title=\"click for documentation on this http interface\""
+ "href=\"http://www.mongodb.org/display/DOCS/Http+Interface\">admin port</a>:" << _port << "<p>\n";
+ */
+
+ doUnlockedStuff(ss);
+
+ WebStatusPlugin::runAll( ss );
+
+ ss << "</body></html>\n";
+ responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ }
+
+ void _rejectREST( string& responseMsg , int& responseCode, vector<string>& headers ) {
+ responseCode = 403;
+ stringstream ss;
+ ss << "REST is not enabled. use --rest to turn on.\n";
+ ss << "check that port " << _port << " is secured for the network too.\n";
+ responseMsg = ss.str();
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ }
+
+ };
+ // ---
+
+ bool prisort( const Prioritizable * a , const Prioritizable * b ) {
+ return a->priority() < b->priority();
+ }
+
+ // -- status framework ---
+    WebStatusPlugin::WebStatusPlugin( const string& sectionName , double priority , const string& subheader )
+        : Prioritizable(priority), _name( sectionName ) , _subHeading( subheader ) {
+ if ( ! _plugins )
+ _plugins = new vector<WebStatusPlugin*>();
+ _plugins->push_back( this );
+ }
+
+ void WebStatusPlugin::initAll() {
+ if ( ! _plugins )
+ return;
+
+ sort( _plugins->begin(), _plugins->end() , prisort );
+
+ for ( unsigned i=0; i<_plugins->size(); i++ )
+ (*_plugins)[i]->init();
+ }
+
+ void WebStatusPlugin::runAll( stringstream& ss ) {
+ if ( ! _plugins )
+ return;
+
+ for ( unsigned i=0; i<_plugins->size(); i++ ) {
+ WebStatusPlugin * p = (*_plugins)[i];
+ ss << "<hr>\n"
+ << "<b>" << p->_name << "</b>";
+
+ ss << " " << p->_subHeading;
+
+ ss << "<br>\n";
+
+ p->run(ss);
+ }
+
+ }
+
+ vector<WebStatusPlugin*> * WebStatusPlugin::_plugins = 0;
+
+    // -- basic status plugins --
+
+ class LogPlugin : public WebStatusPlugin {
+ public:
+ LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0) {
+ }
+
+ virtual void init() {
+ _log = RamLog::get( "global" );
+ if ( ! _log ) {
+ _log = new RamLog("global");
+ Logstream::get().addGlobalTee( _log );
+ }
+ }
+
+ virtual void run( stringstream& ss ) {
+ _log->toHTML( ss );
+ }
+ RamLog * _log;
+ };
+
+ LogPlugin * logPlugin = new LogPlugin();
+
+ // -- handler framework ---
+
+ DbWebHandler::DbWebHandler( const string& name , double priority , bool requiresREST )
+ : Prioritizable(priority), _name(name) , _requiresREST(requiresREST) {
+
+ {
+ // setup strings
+ _defaultUrl = "/";
+ _defaultUrl += name;
+
+ stringstream ss;
+ ss << name << " priority: " << priority << " rest: " << requiresREST;
+ _toString = ss.str();
+ }
+
+ {
+ // add to handler list
+ if ( ! _handlers )
+ _handlers = new vector<DbWebHandler*>();
+ _handlers->push_back( this );
+ sort( _handlers->begin() , _handlers->end() , prisort );
+ }
+ }
+
+ DbWebHandler * DbWebHandler::findHandler( const string& url ) {
+ if ( ! _handlers )
+ return 0;
+
+ for ( unsigned i=0; i<_handlers->size(); i++ ) {
+ DbWebHandler * h = (*_handlers)[i];
+ if ( h->handles( url ) )
+ return h;
+ }
+
+ return 0;
+ }
+
+ vector<DbWebHandler*> * DbWebHandler::_handlers = 0;
+
+ // --- basic handlers ---
+
+ class FavIconHandler : public DbWebHandler {
+ public:
+ FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ responseCode = 404;
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg = "no favicon\n";
+ }
+
+ } faviconHandler;
+
+ class StatusHandler : public DbWebHandler {
+ public:
+ StatusHandler() : DbWebHandler( "_status" , 1 , false ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
+ responseCode = 200;
+
+ static vector<string> commands;
+ if ( commands.size() == 0 ) {
+ commands.push_back( "serverStatus" );
+ commands.push_back( "buildinfo" );
+ }
+
+ BSONObjBuilder buf(1024);
+
+ for ( unsigned i=0; i<commands.size(); i++ ) {
+ string cmd = commands[i];
+
+ Command * c = Command::findCommand( cmd );
+ assert( c );
+ assert( c->locktype() == 0 );
+
+ BSONObj co;
+ {
+ BSONObjBuilder b;
+ b.append( cmd , 1 );
+
+ if ( cmd == "serverStatus" && params["repl"].type() ) {
+ b.append( "repl" , atoi( params["repl"].valuestr() ) );
+ }
+
+ co = b.obj();
+ }
+
+ string errmsg;
+
+ BSONObjBuilder sub;
+ if ( ! c->run( "admin.$cmd" , co , 0, errmsg , sub , false ) )
+ buf.append( cmd , errmsg );
+ else
+ buf.append( cmd , sub.obj() );
+ }
+
+ responseMsg = buf.obj().jsonString();
+
+ }
+
+ } statusHandler;
+
+ class CommandListHandler : public DbWebHandler {
+ public:
+ CommandListHandler() : DbWebHandler( "_commands" , 1 , true ) {}
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ headers.push_back( "Content-Type: text/html;charset=utf-8" );
+ responseCode = 200;
+
+ stringstream ss;
+ ss << start("Commands List");
+ ss << p( a("/", "back", "Home") );
+ ss << p( "<b>MongoDB List of <a href=\"http://www.mongodb.org/display/DOCS/Commands\">Commands</a></b>\n" );
+ const map<string, Command*> *m = Command::commandsByBestName();
+ ss << "S:slave-ok R:read-lock W:write-lock A:admin-only<br>\n";
+ ss << table();
+ ss << "<tr><th>Command</th><th>Attributes</th><th>Help</th></tr>\n";
+ for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ )
+ i->second->htmlHelp(ss);
+ ss << _table() << _end();
+
+ responseMsg = ss.str();
+ }
+ } commandListHandler;
+
+ class CommandsHandler : public DbWebHandler {
+ public:
+ CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ) {}
+
+ bool _cmd( const string& url , string& cmd , bool& text, bo params ) const {
+ cmd = str::after(url, '/');
+ text = params["text"].boolean();
+ return true;
+ }
+
+ Command * _cmd( const string& cmd ) const {
+ const map<string,Command*> *m = Command::webCommands();
+ if( ! m )
+ return 0;
+
+ map<string,Command*>::const_iterator i = m->find(cmd);
+ if ( i == m->end() )
+ return 0;
+
+ return i->second;
+ }
+
+ virtual bool handles( const string& url ) const {
+ string cmd;
+ bool text;
+ if ( ! _cmd( url , cmd , text, bo() ) )
+ return false;
+ return _cmd(cmd) != 0;
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+ string cmd;
+ bool text = false;
+ assert( _cmd( url , cmd , text, params ) );
+ Command * c = _cmd( cmd );
+ assert( c );
+
+ BSONObj cmdObj = BSON( cmd << 1 );
+ Client& client = cc();
+
+ BSONObjBuilder result;
+ execCommand(c, client, 0, "admin.", cmdObj , result, false);
+
+ responseCode = 200;
+
+ string j = result.done().jsonString(Strict, text );
+ responseMsg = j;
+
+ if( text ) {
+ headers.push_back( "Content-Type: text/plain;charset=utf-8" );
+ responseMsg += '\n';
+ }
+ else {
+ headers.push_back( "Content-Type: application/json;charset=utf-8" );
+ }
+
+ }
+
+ } commandsHandler;
+
+ // --- external ----
+
+ void webServerThread(const AdminAccess* adminAccess) {
+ boost::scoped_ptr<const AdminAccess> adminAccessPtr(adminAccess); // adminAccess is owned here
+ Client::initThread("websvr");
+ const int p = cmdLine.port + 1000;
+ DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get());
+ mini.initAndListen();
+ cc().shutdown();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/dbwebserver.h b/src/mongo/db/dbwebserver.h
new file mode 100644
index 00000000000..bdbcba2c07d
--- /dev/null
+++ b/src/mongo/db/dbwebserver.h
@@ -0,0 +1,85 @@
+/** @file dbwebserver.h
+ */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "../util/admin_access.h"
+
+namespace mongo {
+
+ class Prioritizable {
+ public:
+ Prioritizable( double p ) : _priority(p) {}
+ double priority() const { return _priority; }
+ private:
+ double _priority;
+ };
+
+ class DbWebHandler : public Prioritizable {
+ public:
+ DbWebHandler( const string& name , double priority , bool requiresREST );
+ virtual ~DbWebHandler() {}
+
+ virtual bool handles( const string& url ) const { return url == _defaultUrl; }
+
+ virtual bool requiresREST( const string& url ) const { return _requiresREST; }
+
+ virtual void handle( const char *rq, // the full request
+ string url,
+ BSONObj params,
+ // set these and return them:
+ string& responseMsg,
+ int& responseCode,
+ vector<string>& headers, // if completely empty, content-type: text/html will be added
+ const SockAddr &from
+ ) = 0;
+
+ string toString() const { return _toString; }
+ static DbWebHandler * findHandler( const string& url );
+
+ private:
+ string _name;
+ bool _requiresREST;
+
+ string _defaultUrl;
+ string _toString;
+
+ static vector<DbWebHandler*> * _handlers;
+ };
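+
+    /* Illustrative handler subclass (hypothetical; registration happens in the
+       DbWebHandler constructor, so a static instance suffices):
+
+           class PingHandler : public DbWebHandler {
+           public:
+               PingHandler() : DbWebHandler( "_ping", 1, false ) {}
+               virtual void handle( const char *rq, string url, BSONObj params,
+                                    string& responseMsg, int& responseCode,
+                                    vector<string>& headers, const SockAddr &from ) {
+                   responseCode = 200;
+                   responseMsg = "pong\n";
+               }
+           };
+    */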
+
+ class WebStatusPlugin : public Prioritizable {
+ public:
+        WebStatusPlugin( const string& sectionName , double priority , const string& subheader = "" );
+ virtual ~WebStatusPlugin() {}
+
+ virtual void run( stringstream& ss ) = 0;
+        /** called when web server starts up */
+ virtual void init() = 0;
+
+ static void initAll();
+ static void runAll( stringstream& ss );
+ private:
+ string _name;
+ string _subHeading;
+ static vector<WebStatusPlugin*> * _plugins;
+
+ };
+
+ void webServerThread( const AdminAccess* admins );
+ string prettyHostName();
+
+} // namespace mongo
diff --git a/src/mongo/db/diskloc.h b/src/mongo/db/diskloc.h
new file mode 100644
index 00000000000..5295df3e260
--- /dev/null
+++ b/src/mongo/db/diskloc.h
@@ -0,0 +1,160 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* @file diskloc.h
+
+ Storage subsystem management.
+ Lays out our datafiles on disk, manages disk space.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ class Record;
+ class DeletedRecord;
+ class Extent;
+ class MongoDataFile;
+ class DiskLoc;
+
+ template< class Version > class BtreeBucket;
+
+#pragma pack(1)
+ /** represents a disk location/offset on disk in a database. 64 bits.
+ it is assumed these will be passed around by value a lot so don't do anything to make them large
+ (such as adding a virtual function)
+ */
+ class DiskLoc {
+        int _a;     // this will be volume, file #, etc. but is a logical value that could be anything depending on storage engine
+ int ofs;
+
+ public:
+
+ enum SentinelValues {
+ /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ NullOfs = -1,
+ MaxFiles=16000 // thus a limit of about 32TB of data per db
+ };
+
+ DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) { }
+ DiskLoc() { Null(); }
+ DiskLoc(const DiskLoc& l) {
+ _a=l._a;
+ ofs=l.ofs;
+ }
+
+ bool questionable() const {
+ return ofs < -1 ||
+ _a < -1 ||
+ _a > 524288;
+ }
+
+ bool isNull() const { return _a == -1; }
+ void Null() {
+ _a = -1;
+ ofs = 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
+ }
+ void assertOk() { assert(!isNull()); }
+ void setInvalid() {
+ _a = -2;
+ ofs = 0;
+ }
+ bool isValid() const { return _a != -2; }
+
+ string toString() const {
+ if ( isNull() )
+ return "null";
+ stringstream ss;
+ ss << hex << _a << ':' << ofs;
+ return ss.str();
+ }
+
+ BSONObj toBSONObj() const { return BSON( "file" << _a << "offset" << ofs ); }
+
+ int a() const { return _a; }
+
+ int& GETOFS() { return ofs; }
+ int getOfs() const { return ofs; }
+ void set(int a, int b) {
+ _a=a;
+ ofs=b;
+ }
+
+ void inc(int amt) {
+ assert( !isNull() );
+ ofs += amt;
+ }
+
+        bool sameFile(DiskLoc b) {
+            return _a == b._a;
+        }
+
+        bool operator==(const DiskLoc& b) const {
+            return _a == b._a && ofs == b.ofs;
+        }
+ bool operator!=(const DiskLoc& b) const {
+ return !(*this==b);
+ }
+ const DiskLoc& operator=(const DiskLoc& b) {
+ _a=b._a;
+ ofs = b.ofs;
+ //assert(ofs!=0);
+ return *this;
+ }
+ int compare(const DiskLoc& b) const {
+ int x = _a - b._a;
+ if ( x )
+ return x;
+ return ofs - b.ofs;
+ }
+ bool operator<(const DiskLoc& b) const {
+ return compare(b) < 0;
+ }
+
+ /**
+ * Marks this disk loc for writing
+ * @returns a non const reference to this disk loc
+ * This function explicitly signals we are writing and casts away const
+ */
+ DiskLoc& writing() const; // see dur.h
+
+ /* Get the "thing" associated with this disk location.
+ it is assumed the object is what you say it is -- you must assure that
+ (think of this as an unchecked type cast)
+ Note: set your Context first so that the database to which the diskloc applies is known.
+ */
+ BSONObj obj() const;
+ Record* rec() const;
+ DeletedRecord* drec() const;
+ Extent* ext() const;
+
+ template< class V >
+ const BtreeBucket<V> * btree() const;
+
+ // Explicitly signals we are writing and casts away const
+ template< class V >
+ BtreeBucket<V> * btreemod() const;
+
+ /*MongoDataFile& pdf() const;*/
+ };
+#pragma pack()
+
+ const DiskLoc minDiskLoc(0, 1);
+ const DiskLoc maxDiskLoc(0x7fffffff, 0x7fffffff);
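+
+    /* Illustrative DiskLoc arithmetic (hypothetical values):
+
+           DiskLoc loc( 0, 0x1000 );       // file 0, offset 0x1000
+           loc.inc( 16 );                  // advance within the same file
+           assert( loc.compare( minDiskLoc ) > 0 && loc.compare( maxDiskLoc ) < 0 );
+    */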
+
+} // namespace mongo
diff --git a/src/mongo/db/driverHelpers.cpp b/src/mongo/db/driverHelpers.cpp
new file mode 100644
index 00000000000..12aa01886c4
--- /dev/null
+++ b/src/mongo/db/driverHelpers.cpp
@@ -0,0 +1,62 @@
+// driverHelpers.cpp
+
+/**
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ this file has dbcommands that are for drivers
+ mostly helpers
+*/
+
+
+#include "pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "namespace-inl.h"
+#include "commands.h"
+#include "cmdline.h"
+#include "btree.h"
+#include "curop-inl.h"
+#include "../util/background.h"
+#include "../scripting/engine.h"
+
+namespace mongo {
+
+ class BasicDriverHelper : public Command {
+ public:
+ BasicDriverHelper( const char * name ) : Command( name ) {}
+
+ virtual LockType locktype() const { return NONE; }
+ virtual bool slaveOk() const { return true; }
+ virtual bool slaveOverrideOk() { return true; }
+ };
+
+ class ObjectIdTest : public BasicDriverHelper {
+ public:
+ ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {}
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj.firstElement().type() != jstOID ) {
+ errmsg = "not oid";
+ return false;
+ }
+
+ const OID& oid = cmdObj.firstElement().__oid();
+ result.append( "oid" , oid );
+ result.append( "str" , oid.str() );
+
+ return true;
+ }
+ } driverObjectIdTest;
+}
diff --git a/src/mongo/db/dur.cpp b/src/mongo/db/dur.cpp
new file mode 100644
index 00000000000..822fa5232c0
--- /dev/null
+++ b/src/mongo/db/dur.cpp
@@ -0,0 +1,840 @@
+// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ phases:
+
+ PREPLOGBUFFER
+ we will build an output buffer ourself and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ WRITETOJOURNAL
+ we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
+ have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
+ for now (1.7.5/1.8.0) we are in read lock which is not ideal.
+ WRITETODATAFILES
+ apply the writes back to the non-private MMF after they are for certain in redo log
+ REMAPPRIVATEVIEW
+ we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
+ remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
+ to be too frequent.
+ there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
+ be required. so doing these remaps fractionally is helpful.
+
+ mutexes:
+
+ READLOCK dbMutex
+ LOCK groupCommitMutex
+ PREPLOGBUFFER()
+ READLOCK mmmutex
+ commitJob.reset()
+ UNLOCK dbMutex // now other threads can write
+ WRITETOJOURNAL()
+ WRITETODATAFILES()
+ UNLOCK mmmutex
+ UNLOCK groupCommitMutex
+
+ on the next write lock acquisition for dbMutex: // see MongoMutex::_acquiredWriteLock()
+ REMAPPRIVATEVIEW()
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "client.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_commitjob.h"
+#include "dur_recover.h"
+#include "dur_stats.h"
+#include "../util/concurrency/race.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/timer.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ void PREPLOGBUFFER(JSectHeader& outParm);
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed);
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed);
+
+ /** declared later in this file
+ only used in this file -- use DurableInterface::commitNow() outside
+ */
+ static void groupCommit();
+
+ CommitJob& commitJob = *(new CommitJob()); // don't destroy
+
+ Stats stats;
+
+ void Stats::S::reset() {
+ memset(this, 0, sizeof(*this));
+ }
+
+ Stats::Stats() {
+ _a.reset();
+ _b.reset();
+ curr = &_a;
+ _intervalMicros = 3000000;
+ }
+
+ Stats::S * Stats::other() {
+ return curr == &_a ? &_b : &_a;
+ }
+ string _CSVHeader();
+
+ string Stats::S::_CSVHeader() {
+ return "cmts jrnMB\twrDFMB\tcIWLk\tearly\tprpLgB wrToJ\twrToDF\trmpPrVw";
+ }
+
+ string Stats::S::_asCSV() {
+ stringstream ss;
+ ss <<
+ setprecision(2) <<
+ _commits << '\t' << fixed <<
+ _journaledBytes / 1000000.0 << '\t' <<
+ _writeToDataFilesBytes / 1000000.0 << '\t' <<
+ _commitsInWriteLock << '\t' <<
+ _earlyCommits << '\t' <<
+ (unsigned) (_prepLogBufferMicros/1000) << '\t' <<
+ (unsigned) (_writeToJournalMicros/1000) << '\t' <<
+ (unsigned) (_writeToDataFilesMicros/1000) << '\t' <<
+ (unsigned) (_remapPrivateViewMicros/1000);
+ return ss.str();
+ }
+
+ //int getAgeOutJournalFiles();
+ BSONObj Stats::S::_asObj() {
+ BSONObjBuilder b;
+ b <<
+ "commits" << _commits <<
+ "journaledMB" << _journaledBytes / 1000000.0 <<
+ "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 <<
+ "compression" << _journaledBytes / (_uncompressedBytes+1.0) <<
+ "commitsInWriteLock" << _commitsInWriteLock <<
+ "earlyCommits" << _earlyCommits <<
+ "timeMs" <<
+ BSON( "dt" << _dtMillis <<
+ "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) <<
+ "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) <<
+ "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) <<
+ "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000)
+ );
+ /*int r = getAgeOutJournalFiles();
+ if( r == -1 )
+ b << "ageOutJournalFiles" << "mutex timeout";
+ if( r == 0 )
+ b << "ageOutJournalFiles" << false;*/
+ if( cmdLine.journalCommitInterval != 0 )
+ b << "journalCommitIntervalMs" << cmdLine.journalCommitInterval;
+ return b.obj();
+ }
+
+ BSONObj Stats::asObj() {
+ return other()->_asObj();
+ }
+
+ void Stats::rotate() {
+ unsigned long long now = curTimeMicros64();
+ unsigned long long dt = now - _lastRotate;
+ if( dt >= _intervalMicros && _intervalMicros ) {
+ // rotate
+ curr->_dtMillis = (unsigned) (dt/1000);
+ _lastRotate = now;
+ curr = other();
+ curr->reset();
+ }
+ }
+
+ void NonDurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ memcpy(dst, src, len);
+ }
+
+ void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) {
+ // we are at least read locked, so we need not worry about REMAPPRIVATEVIEW herein.
+ DEV d.dbMutex.assertAtLeastReadLocked();
+
+ MemoryMappedFile::makeWritable(dst, len);
+
+ // we enter the RecoveryJob mutex here, so that if WRITETODATAFILES is happening we do not
+ // conflict with it
+ scoped_lock lk1( RecoveryJob::get()._mx );
+
+ // we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches
+ //
+ // either of these mutexes also makes setNoJournal threadsafe, which is good as we call it from a read
+ // (not a write) lock in class SlaveTracking
+ //
+ scoped_lock lk( privateViews._mutex() );
+
+ size_t ofs;
+ MongoMMF *f = privateViews.find_inlock(dst, ofs);
+ assert(f);
+ void *w = (((char *)f->view_write())+ofs);
+ // first write it to the writable (file) view
+ memcpy(w, src, len);
+ if( memcmp(w, dst, len) ) {
+ // if we get here, a copy-on-write had previously occurred. so write it to the private view too
+ // to keep them in sync. we do this as we do not want to cause a copy on write unnecessarily.
+ memcpy(dst, src, len);
+ }
+ }
+
+ /** base declare write intent function that all the helpers call. */
+ void DurableImpl::declareWriteIntent(void *p, unsigned len) {
+ commitJob.note(p, len);
+ }
+
+ static DurableImpl* durableImpl = new DurableImpl();
+ static NonDurableImpl* nonDurableImpl = new NonDurableImpl();
+ DurableInterface* DurableInterface::_impl = nonDurableImpl;
+
+ void DurableInterface::enableDurability() {
+ assert(_impl == nonDurableImpl);
+ _impl = durableImpl;
+ }
+
+ void DurableInterface::disableDurability() {
+ assert(_impl == durableImpl);
+ massert(13616, "can't disable durability with pending writes", !commitJob.hasWritten());
+ _impl = nonDurableImpl;
+ }
+
+ bool DurableImpl::commitNow() {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+
+ bool DurableImpl::awaitCommit() {
+ commitJob._notify.awaitBeyondNow();
+ return true;
+ }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will just replay the creation if the create didn't
+ happen because of crashing.
+ */
+ void DurableImpl::createdFile(string filename, unsigned long long len) {
+ shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
+ commitJob.noteOp(op);
+ }
+
+ void* DurableImpl::writingPtr(void *x, unsigned len) {
+ void *p = x;
+ declareWriteIntent(p, len);
+ return p;
+ }
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer.
+ */
+ void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
+ char *p = (char *) buf;
+ declareWriteIntent(p+ofs, len);
+ return p;
+ }
+
+ void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) {
+ char *p = (char *) buf;
+ for( vector< pair< long long, unsigned > >::const_iterator i = ranges.begin();
+ i != ranges.end(); ++i ) {
+ declareWriteIntent( p + i->first, i->second );
+ }
+ return p;
+ }
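+
+        /* Illustrative write-intent pattern (hypothetical caller; must hold the
+           write lock, as with all intent declarations):
+
+               char *p = (char *) getDur().writingPtr( rec, len );
+               memcpy( p, src, len );   // journaled: intent was declared first
+        */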
+
+ bool DurableImpl::aCommitIsNeeded() const {
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ return commitJob.bytes() > UncommittedBytesLimit;
+ }
+
+ bool DurableImpl::commitIfNeeded() {
+ if ( !d.dbMutex.isWriteLocked() )
+ return false;
+
+ DEV commitJob._nSinceCommitIfNeededCall = 0;
+ if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit?
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ return true;
+ }
+ return false;
+ }
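+
+        /* Illustrative use of commitIfNeeded in a long write loop (hypothetical
+           caller holding the write lock) -- this keeps the uncommitted byte count
+           bounded, as in Helpers::removeRange above:
+
+               while ( moreWorkToDo() ) {
+                   doOneWrite();
+                   getDur().commitIfNeeded();
+               }
+        */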
+
+ /** Used in _DEBUG builds to check that we didn't overwrite the last intent
+ that was declared. called just before writelock release. we check a few
+ bytes after the declared region to see if they changed.
+
+ @see MongoMutex::_releasedWriteLock
+
+ SLOW
+ */
+#if 0
+ void DurableImpl::debugCheckLastDeclaredWrite() {
+ static int n;
+ ++n;
+
+ assert(debug && cmdLine.dur);
+ if (commitJob.writes().empty())
+ return;
+ const WriteIntent &i = commitJob.lastWrite();
+ size_t ofs;
+ MongoMMF *mmf = privateViews.find(i.start(), ofs);
+ if( mmf == 0 )
+ return;
+ size_t past = ofs + i.length();
+ if( mmf->length() < past + 8 )
+ return; // too close to end of view
+ char *priv = (char *) mmf->getView();
+ char *writ = (char *) mmf->view_write();
+ unsigned long long *a = (unsigned long long *) (priv+past);
+ unsigned long long *b = (unsigned long long *) (writ+past);
+ if( *a != *b ) {
+ for( set<WriteIntent>::iterator it(commitJob.writes().begin()), end((commitJob.writes().begin())); it != end; ++it ) {
+ const WriteIntent& wi = *it;
+ char *r1 = (char*) wi.start();
+ char *r2 = (char*) wi.end();
+ if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
+ //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
+ return;
+ }
+ }
+ log() << "journal data after write area " << i.start() << " does not agree" << endl;
+ log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
+ log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
+ log() << " n: " << n << endl;
+ log() << endl;
+ }
+ }
+#endif
+
+ // Functor to be called over all MongoFiles
+
+ class validateSingleMapMatches {
+ public:
+ validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes) {}
+ void operator () (MongoFile *mf) {
+ if( mf->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) mf;
+ const unsigned char *p = (const unsigned char *) mmf->getView();
+ const unsigned char *w = (const unsigned char *) mmf->view_write();
+
+ if (!p || !w) return; // File not fully opened yet
+
+ _bytes += mmf->length();
+
+ assert( mmf->length() == (unsigned) mmf->length() );
+ {
+ scoped_lock lk( privateViews._mutex() ); // see setNoJournal
+ if (memcmp(p, w, (unsigned) mmf->length()) == 0)
+ return; // next file
+ }
+
+ unsigned low = 0xffffffff;
+ unsigned high = 0;
+ log() << "DurParanoid mismatch in " << mmf->filename() << endl;
+ int logged = 0;
+ unsigned lastMismatch = 0xffffffff;
+ for( unsigned i = 0; i < mmf->length(); i++ ) {
+ if( p[i] != w[i] ) {
+ if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
+ log() << endl; // separate blocks of mismatches
+ lastMismatch= i;
+ if( ++logged < 60 ) {
+ if( logged == 1 )
+ log() << "ofs % 628 = 0x" << hex << (i%628) << endl; // for .ns files to find offset in record
+ stringstream ss;
+ ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
+ if( p[i] > 32 && p[i] <= 126 )
+ ss << '\t' << p[i];
+ log() << ss.str() << endl;
+ }
+ if( logged == 60 )
+ log() << "..." << endl;
+ if( i < low ) low = i;
+ if( i > high ) high = i;
+ }
+ }
+ if( low != 0xffffffff ) {
+ std::stringstream ss;
+ ss << "journal error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
+ log() << ss.str() << endl;
+ log() << "priv loc: " << (void*)(p+low) << ' ' << endl;
+ set<WriteIntent>& b = commitJob.writes();
+ (void)b; // mark as unused. Useful for inspection in debugger
+
+ // should we abort() here so this isn't unnoticed in some circumstances?
+ massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
+ }
+ }
+ }
+ private:
+ unsigned long long& _bytes;
+ };
+
+ /** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
+ */
+ void debugValidateAllMapsMatch() {
+ if( ! (cmdLine.durOptions & CmdLine::DurParanoid) )
+ return;
+
+ unsigned long long bytes = 0;
+ Timer t;
+ MongoFile::forEach(validateSingleMapMatches(bytes));
+ OCCASIONALLY log() << "DurParanoid map check " << t.millis() << "ms for " << (bytes / (1024*1024)) << "MB" << endl;
+ }
+
+ extern size_t privateMapBytes;
+
+ static void _REMAPPRIVATEVIEW() {
+ // todo: Consider using ProcessInfo herein and watching for getResidentSize to drop. that could be a way
+ // to assure very good behavior here.
+
+ static unsigned startAt;
+ static unsigned long long lastRemap;
+
+ LOG(4) << "journal REMAPPRIVATEVIEW" << endl;
+
+ d.dbMutex.assertWriteLocked();
+ d.dbMutex._remapPrivateViewRequested = false;
+ assert( !commitJob.hasWritten() );
+
+ // we want to remap all private views about every 2 seconds. there could be ~1000 views so
+ // we do a little each pass; beyond the remap time, more significantly, there will be copy on write
+ // faults after remapping, so doing a little bit at a time will avoid big load spikes on
+ // remapping.
+ unsigned long long now = curTimeMicros64();
+ double fraction = (now-lastRemap)/2000000.0;
+ if( cmdLine.durOptions & CmdLine::DurAlwaysRemap )
+ fraction = 1;
+ lastRemap = now;
+
+ LockMongoFilesShared lk;
+ set<MongoFile*>& files = MongoFile::getAllFiles();
+ unsigned sz = files.size();
+ if( sz == 0 )
+ return;
+
+ {
+ // be careful not to use too much memory if the write rate is
+ // extremely high
+ double f = privateMapBytes / ((double)UncommittedBytesLimit);
+ if( f > fraction ) {
+ fraction = f;
+ }
+ privateMapBytes = 0;
+ }
+
+ unsigned ntodo = (unsigned) (sz * fraction);
+ if( ntodo < 1 ) ntodo = 1;
+ if( ntodo > sz ) ntodo = sz;
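+
+            // worked example (illustrative numbers): with sz=1000 views and ~200ms
+            // between passes, fraction = 0.2/2.0 = 0.1, so ntodo = 100 files are
+            // remapped per pass -- the full set turns over roughly every 2 seconds.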
+
+ const set<MongoFile*>::iterator b = files.begin();
+ const set<MongoFile*>::iterator e = files.end();
+ set<MongoFile*>::iterator i = b;
+ // skip to our starting position
+ for( unsigned x = 0; x < startAt; x++ ) {
+ i++;
+ if( i == e ) i = b;
+ }
+ unsigned startedAt = startAt;
+ startAt = (startAt + ntodo) % sz; // mark where to start next time
+
+ Timer t;
+ for( unsigned x = 0; x < ntodo; x++ ) {
+ dassert( i != e );
+ if( (*i)->isMongoMMF() ) {
+ MongoMMF *mmf = (MongoMMF*) *i;
+ assert(mmf);
+ if( mmf->willNeedRemap() ) {
+ mmf->willNeedRemap() = false;
+ mmf->remapThePrivateView();
+ }
+ i++;
+ if( i == e ) i = b;
+ }
+ }
+ LOG(2) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' ' << t.millis() << "ms" << endl;
+ }
+
+ /** We need to remap the private views periodically. otherwise they would become very large.
+ Call within write lock. See top of file for more commentary.
+ */
+ void REMAPPRIVATEVIEW() {
+ Timer t;
+ _REMAPPRIVATEVIEW();
+ stats.curr->_remapPrivateViewMicros += t.micros();
+ }
+
+ // lock order: dbMutex first, then this
+ mutex groupCommitMutex("groupCommit");
+
+ bool _groupCommitWithLimitedLocks() {
+
+ int p = 0;
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_ptr<ExcludeAllWrites> lk1( new ExcludeAllWrites() );
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ scoped_lock lk2(groupCommitMutex);
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+                // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return true;
+ }
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ JSectHeader h;
+ PREPLOGBUFFER(h); // need to be in readlock (writes excluded) for this
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ LockMongoFilesShared lk3;
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ unsigned abLen = commitJob._ab.len();
+ commitJob.reset(); // must be reset before allowing anyone to write
+ DEV assert( !commitJob.hasWritten() );
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // release the readlock -- allowing others to now write while we are writing to the journal (etc.)
+ lk1.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // ****** now other threads can do writes ******
+
+ WRITETOJOURNAL(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // a check that no one touched the builder while we were doing work. if so, our locking is wrong.
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ LOG(4) << "groupcommitll " << p++ << " WRITETODATAFILES()" << endl;
+
+ WRITETODATAFILES(h, commitJob._ab);
+ assert( abLen == commitJob._ab.len() ); // check again wasn't modded
+ commitJob._ab.reset();
+
+ LOG(4) << "groupcommitll " << p++ << endl;
+
+ // can't : d.dbMutex._remapPrivateViewRequested = true;
+
+ return true;
+ }
+
+ /** @return true if committed; false if lock acquisition timed out (we only try for a read lock herein and only wait for a certain duration). */
+ bool groupCommitWithLimitedLocks() {
+ try {
+ return _groupCommitWithLimitedLocks();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommitLL causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("dur1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur3");
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommitLL causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("dur4");
+ }
+ return false;
+ }
+
+ static void _groupCommit() {
+
+ LOG(4) << "_groupCommit " << endl;
+
+ // we need to be at least read locked on the dbMutex so that we know the write intent data
+ // structures are not changing while we work
+ d.dbMutex.assertAtLeastReadLocked();
+
+ commitJob.beginCommit();
+
+ if( !commitJob.hasWritten() ) {
+                // getlasterror request could have come after the data was already committed
+ commitJob.notifyCommitted();
+ return;
+ }
+
+ // we need to make sure two group commits aren't running at the same time
+ // (and we are only read locked in the dbMutex, so it could happen)
+ scoped_lock lk(groupCommitMutex);
+
+ JSectHeader h;
+ PREPLOGBUFFER(h);
+
+ // todo : write to the journal outside locks, as this write can be slow.
+ // however, be careful then about remapprivateview as that cannot be done
+ // if new writes are then pending in the private maps.
+ WRITETOJOURNAL(h, commitJob._ab);
+
+ // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that)
+ commitJob.notifyCommitted();
+
+ WRITETODATAFILES(h, commitJob._ab);
+ debugValidateAllMapsMatch();
+
+ commitJob.reset();
+ commitJob._ab.reset();
+
+ // REMAPPRIVATEVIEW
+ //
+ // remapping private views must occur after WRITETODATAFILES otherwise
+ // we wouldn't see newly written data on reads.
+ //
+ DEV assert( !commitJob.hasWritten() );
+ if( !d.dbMutex.isWriteLocked() ) {
+ // this needs to be done in a write lock (as there is a short window during remapping when each view
+ // might not exist), so we defer it to the next acquisition of the write lock instead of doing it here
+ // (there is no rush if you aren't writing anyway -- but when it is done, it must be done before any
+ // uncommitted writes occur). If desired, perhaps this can be eliminated on posix, as it may be that
+ // the remap is race-free there.
+ //
+ d.dbMutex._remapPrivateViewRequested = true;
+ }
+ else {
+ stats.curr->_commitsInWriteLock++;
+ // however, if we are already write locked, we must do it now -- up the call tree someone
+ // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls
+ // this method when a file (and its views) is about to go away.
+ //
+ REMAPPRIVATEVIEW();
+ }
+ }
+
+ /** locking: in read lock when called
+ or, for early commits (commitIfNeeded), in write lock
+ @see MongoMMF::close()
+ */
+ static void groupCommit() {
+ try {
+ _groupCommit();
+ }
+ catch(DBException& e ) {
+ log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl;
+ mongoAbort("gc1");
+ }
+ catch(std::ios_base::failure& e) {
+ log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc2");
+ }
+ catch(std::bad_alloc& e) {
+ log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc3");
+ }
+ catch(std::exception& e) {
+ log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("gc4");
+ }
+ LOG(4) << "groupCommit end" << endl;
+ }
+
+ static void go() {
+ const int N = 10;
+ static int n;
+ if( privateMapBytes < UncommittedBytesLimit && ++n % N && (cmdLine.durOptions&CmdLine::DurAlwaysRemap)==0 ) {
+ // limited locks version doesn't do any remapprivateview at all, so only try this if privateMapBytes
+ // is in an acceptable range. also every Nth commit, we do everything so we can do some remapping;
+ // remapping a lot at once could cause jitter from a large burst of copy-on-writes.
+ if( groupCommitWithLimitedLocks() )
+ return;
+ }
+ else {
+ readlocktry lk("", 1000);
+ if( lk.got() ) {
+ groupCommit();
+ return;
+ }
+ }
+
+ // starvation on read locks could occur, so if read lock acquisition is slow, try to get a
+ // write lock instead. otherwise journaling could be delayed too long (too much data will
+ // not accumulate, though, as the commitIfNeeded logic will have executed in the meantime if there
+ // have been writes)
+ writelock lk;
+ groupCommit();
+ }
+
+ /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its
+ views disappear
+ */
+ void closingFileNotification() {
+ if (!cmdLine.dur)
+ return;
+
+ if( d.dbMutex.atLeastReadLocked() ) {
+ groupCommit();
+ }
+ else {
+ assert( inShutdown() );
+ if( commitJob.hasWritten() ) {
+ log() << "journal warning files are closing outside locks with writes pending" << endl;
+ }
+ }
+ }
+
+ extern int groupCommitIntervalMs;
+ boost::filesystem::path getJournalDir();
+
+ void durThread() {
+ Client::initThread("journal");
+
+ bool samePartition = true;
+ try {
+ const string dbpathDir = boost::filesystem::path(dbpath).native_directory_string();
+ samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
+ }
+ catch(...) {
+ }
+
+ while( !inShutdown() ) {
+ RACECHECK
+
+ unsigned ms = cmdLine.journalCommitInterval;
+ if( ms == 0 ) {
+ // use default
+ ms = samePartition ? 100 : 30;
+ }
+
+ unsigned oneThird = (ms / 3) + 1; // +1 so never zero
+
+ try {
+ stats.rotate();
+
+ // we do this in a couple of blocks (the invoke()), which makes throughput a tiny bit faster (only a little),
+ // but is likely also less spiky on our cpu usage, which is good.
+
+ // commit sooner if one or more getLastError j:true requests are pending
+ sleepmillis(oneThird);
+ for( unsigned i = 1; i <= 2; i++ ) {
+ if( commitJob._notify.nWaiting() )
+ break;
+ commitJob.wi()._deferred.invoke();
+ sleepmillis(oneThird);
+ }
+
+ go();
+ }
+ catch(std::exception& e) {
+ log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
+ mongoAbort("exception in durThread");
+ }
+ }
+ cc().shutdown();
+ }
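+
+ /* timing sketch (illustrative): with the default 100ms interval, oneThird is
+ 100/3 + 1 = 34ms. The loop above sleeps 34ms, drains deferred intents, and
+ repeats up to twice more, but jumps straight to go() as soon as a
+ getLastError j:true waiter appears in commitJob._notify -- so such waiters
+ wait roughly a third of the commit interval at most, not the whole thing. */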
+
+ void recover();
+
+ unsigned notesThisLock = 0;
+
+ void releasingWriteLock() {
+ DEV notesThisLock = 0;
+ // implicit commitIfNeeded check on each write unlock
+ DEV commitJob._nSinceCommitIfNeededCall = 0; // implicit commit if needed
+ if( commitJob.bytes() > UncommittedBytesLimit || cmdLine.durOptions & CmdLine::DurAlwaysCommit ) {
+ stats.curr->_earlyCommits++;
+ groupCommit();
+ }
+ }
+
+ void preallocateFiles();
+
+ /** at startup, recover, and then start the journal threads */
+ void startup() {
+ if( !cmdLine.dur )
+ return;
+
+#if defined(_DURABLEDEFAULTON)
+ DEV {
+ if( time(0) & 1 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysCommit;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysCommit mode for this run" << endl;
+ }
+ if( time(0) & 2 ) {
+ cmdLine.durOptions |= CmdLine::DurAlwaysRemap;
+ log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysRemap mode for this run" << endl;
+ }
+ }
+#endif
+
+ DurableInterface::enableDurability();
+
+ journalMakeDir();
+ try {
+ recover();
+ }
+ catch(...) {
+ log() << "exception during recovery" << endl;
+ throw;
+ }
+
+ preallocateFiles();
+
+ boost::thread t(durThread);
+ }
+
+ void DurableImpl::syncDataAndTruncateJournal() {
+ d.dbMutex.assertWriteLocked();
+
+ // a commit from the commit thread won't begin while we are in the write lock,
+ // but it may already be in progress and the end of that work is done outside
+ // (dbMutex) locks. The block below waits for that to complete if it is already underway.
+ {
+ scoped_lock lk(groupCommitMutex);
+ }
+
+ groupCommit();
+ MongoFile::flushAll(true);
+ journalCleanup();
+
+ assert(!haveJournalFiles()); // Double check post-conditions
+ }
+
+ } // namespace dur
+
+} // namespace mongo
diff --git a/src/mongo/db/dur.h b/src/mongo/db/dur.h
new file mode 100644
index 00000000000..f06ff500195
--- /dev/null
+++ b/src/mongo/db/dur.h
@@ -0,0 +1,209 @@
+// @file dur.h durability support
+
+#pragma once
+
+#include "diskloc.h"
+#include "mongommf.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+
+ void mongoAbort(const char *msg);
+ void abort(); // not defined -- use mongoAbort() instead
+
+ namespace dur {
+
+ // a smaller limit is likely better on 32 bit
+#if defined(__i386__) || defined(_M_IX86)
+ const unsigned UncommittedBytesLimit = 50 * 1024 * 1024;
+#else
+ const unsigned UncommittedBytesLimit = 100 * 1024 * 1024;
+#endif
+
+ /** Call during startup so the durability module can initialize
+ Throws if fatal error
+ Does nothing if cmdLine.dur is false
+ */
+ void startup();
+
+ class DurableInterface : boost::noncopyable {
+ public:
+ virtual ~DurableInterface() { log() << "ERROR warning ~DurableInterface not intended to be called" << endl; }
+
+ /** Declare that a file has been created
+ Normally writes are applied only after journaling, for safety. But here the file
+ is created first, and the journal will just replay the creation if the create didn't
+ happen because of crashing.
+ */
+ virtual void createdFile(string filename, unsigned long long len) = 0;
+
+ /** Declarations of write intent.
+
+ Use these methods to declare "i'm about to write to x and it should be logged for redo."
+
+ Failure to call writing...() is checked in _DEBUG mode by using a read-only mapped view
+ (i.e., you'll segfault if that code path is exercised). The _DEBUG check doesn't
+ verify that your length is correct though.
+ */
+
+ /** declare intent to write to x for up to len
+ @return pointer where to write. this is modified when testIntent is true.
+ */
+ virtual void* writingPtr(void *x, unsigned len) = 0;
+
+ /** declare write intent; should already be in the write view to work correctly when testIntent is true.
+ if you aren't, use writingPtr() instead.
+ */
+ virtual void declareWriteIntent(void *x, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ofs offset within buf at which we will write
+ @param len the length at ofs we will write
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingAtOffset(void *buf, unsigned ofs, unsigned len) = 0;
+
+ /** declare intent to write
+ @param ranges vector of pairs representing ranges. Each pair
+ comprises an offset from buf where a range begins, then the
+ range length.
+ @return new buffer pointer. this is modified when testIntent is true.
+ */
+ virtual void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) = 0;
+
+ /** Wait for acknowledgement of the next group commit.
+ @return true if --dur is on. There will be delay.
+ @return false if --dur is off.
+ */
+ virtual bool awaitCommit() = 0;
+
+ /** Commit immediately.
+
+ Generally, you do not want to do this often, as highly granular committing may affect
+ performance.
+
+ Does not return until the commit is complete.
+
+ You must be at least read locked when you call this. Ideally, you are not write locked
+ and then read operations can occur concurrently.
+
+ @return true if --dur is on.
+ @return false if --dur is off. (in which case there is no action)
+ */
+ virtual bool commitNow() = 0;
+
+ /** Commit if enough bytes have been modified. Current threshold is UncommittedBytesLimit
+ (50MB on 32 bit builds, 100MB otherwise).
+
+ The idea is that long running write operations that don't yield
+ (like creating an index or an update with $atomic) can call this
+ whenever the db is in a sane state and it will prevent commits
+ from growing too large.
+ @return true if committed
+ */
+ virtual bool commitIfNeeded() = 0;
+
+ /** @return true if time to commit but does NOT do a commit */
+ virtual bool aCommitIsNeeded() const = 0;
+
+ /** Declare write intent for a DiskLoc. @see DiskLoc::writing() */
+ inline DiskLoc& writingDiskLoc(DiskLoc& d) { return *((DiskLoc*) writingPtr(&d, sizeof(d))); }
+
+ /** Declare write intent for an int */
+ inline int& writingInt(const int& d) { return *((int*) writingPtr((int*) &d, sizeof(d))); }
+
+ /** "assume i've already indicated write intent, let me write"
+ redeclaration is fine too, but this is faster.
+ */
+ template <typename T>
+ inline
+ T* alreadyDeclared(T *x) {
+#if defined(_TESTINTENT)
+ return (T*) MongoMMF::switchToPrivateView(x);
+#else
+ return x;
+#endif
+ }
+
+ /** declare intent to write to x for sizeof(*x) */
+ template <typename T>
+ inline
+ T* writing(T *x) {
+ return (T*) writingPtr(x, sizeof(T));
+ }
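+
+ /* typical call pattern (illustrative; 'header', 'MyStruct', 'p', and 'v' are
+ hypothetical names, not part of this interface):
+
+ getDur().writingInt(header.len) = newLen; // declare intent, assign via returned ref
+
+ MyStruct *s = getDur().writing(p); // p points into a mapped view
+ s->field = v; // mutate via the returned pointer
+
+ the mutation must go through the returned pointer/reference so that the
+ testIntent view switch (when enabled) is honored. */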
+
+ /** write something that doesn't have to be journaled, as this write is "unimportant".
+ a good example is paddingFactor.
+ can be thought of as memcpy(dst,src,len)
+ the dur implementation acquires a mutex in this method, so do not assume it is faster
+ without measuring!
+ */
+ virtual void setNoJournal(void *dst, void *src, unsigned len) = 0;
+
+ /** Commits pending changes, flushes all changes to main data
+ files, then removes the journal.
+
+ This is useful as a "barrier" to ensure that writes before this
+ call will never go through recovery and be applied to files
+ that have had changes made after this call applied.
+ */
+ virtual void syncDataAndTruncateJournal() = 0;
+
+ static DurableInterface& getDur() { return *_impl; }
+
+ private:
+ /** Intentionally unimplemented method.
+ It's very easy to manipulate Record::data open ended. Thus a call to writing(Record*) is suspect.
+ This will override the templated version and yield an unresolved external.
+ */
+ Record* writing(Record* r);
+ /** Intentionally unimplemented method. BtreeBuckets are allocated in buffers larger than sizeof( BtreeBucket ). */
+// BtreeBucket* writing( BtreeBucket* );
+ /** Intentionally unimplemented method. NamespaceDetails may be based on references to 'Extra' objects. */
+ NamespaceDetails* writing( NamespaceDetails* );
+
+ static DurableInterface* _impl; // NonDurableImpl at startup()
+ static void enableDurability(); // makes _impl a DurableImpl
+ static void disableDurability(); // makes _impl a NonDurableImpl
+
+ // these need to be able to enable/disable Durability
+ friend void startup();
+ friend class TempDisableDurability;
+ }; // class DurableInterface
+
+ class NonDurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len) { return x; }
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len) { return buf; }
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges) { return buf; }
+ void declareWriteIntent(void *, unsigned) { }
+ void createdFile(string filename, unsigned long long len) { }
+ bool awaitCommit() { return false; }
+ bool commitNow() { return false; }
+ bool commitIfNeeded() { return false; }
+ bool aCommitIsNeeded() const { return false; }
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal() {}
+ };
+
+ class DurableImpl : public DurableInterface {
+ void* writingPtr(void *x, unsigned len);
+ void* writingAtOffset(void *buf, unsigned ofs, unsigned len);
+ void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges);
+ void declareWriteIntent(void *, unsigned);
+ void createdFile(string filename, unsigned long long len);
+ bool awaitCommit();
+ bool commitNow();
+ bool aCommitIsNeeded() const;
+ bool commitIfNeeded();
+ void setNoJournal(void *dst, void *src, unsigned len);
+ void syncDataAndTruncateJournal();
+ };
+
+ } // namespace dur
+
+ inline dur::DurableInterface& getDur() { return dur::DurableInterface::getDur(); }
+
+ /** declare that we are modifying a diskloc and this is a datafile write. */
+ inline DiskLoc& DiskLoc::writing() const { return getDur().writingDiskLoc(*const_cast< DiskLoc * >( this )); }
+
+}
diff --git a/src/mongo/db/dur_commitjob.cpp b/src/mongo/db/dur_commitjob.cpp
new file mode 100644
index 00000000000..5a9e9cb5679
--- /dev/null
+++ b/src/mongo/db/dur_commitjob.cpp
@@ -0,0 +1,240 @@
+/* @file dur_commitjob.cpp */
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "dur_stats.h"
+#include "taskqueue.h"
+#include "client.h"
+
+namespace mongo {
+
+ namespace dur {
+
+ BOOST_STATIC_ASSERT( UncommittedBytesLimit > BSONObjMaxInternalSize * 3 );
+ BOOST_STATIC_ASSERT( sizeof(void*)==4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6 );
+
+ void Writes::D::go(const Writes::D& d) {
+ commitJob.wi()._insertWriteIntent(d.p, d.len);
+ }
+
+ void WriteIntent::absorb(const WriteIntent& other) {
+ dassert(overlaps(other));
+
+ void* newStart = min(start(), other.start());
+ p = max(p, other.p);
+ len = (char*)p - (char*)newStart;
+
+ dassert(contains(other));
+ }
+
+ void Writes::clear() {
+ d.dbMutex.assertAtLeastReadLocked();
+
+ _alreadyNoted.clear();
+ _writes.clear();
+ _ops.clear();
+ _drained = false;
+#if defined(DEBUG_WRITE_INTENT)
+ cout << "_debug clear\n";
+ _debug.clear();
+#endif
+ }
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *p, int len) {
+ if( commitJob.wi()._debug[p] >= len )
+ return;
+ log() << "assertAlreadyDeclared fails " << (void*)p << " len:" << len << ' ' << commitJob.wi()._debug[p] << endl;
+ printStackTrace();
+ abort();
+ }
+#endif
+
+ void Writes::_insertWriteIntent(void* p, int len) {
+ WriteIntent wi(p, len);
+
+ if (_writes.empty()) {
+ _writes.insert(wi);
+ return;
+ }
+
+ typedef set<WriteIntent>::const_iterator iterator; // shorter
+
+ iterator closest = _writes.lower_bound(wi);
+ // closest.end() >= wi.end()
+
+ if ((closest != _writes.end() && closest->overlaps(wi)) || // high end
+ (closest != _writes.begin() && (--closest)->overlaps(wi))) { // low end
+ if (closest->contains(wi))
+ return; // nothing to do
+
+ // find overlapping range and merge into wi
+ iterator end(closest);
+ iterator begin(closest);
+ while ( end->overlaps(wi)) { wi.absorb(*end); ++end; if (end == _writes.end()) break; } // look forwards
+ while (begin->overlaps(wi)) { wi.absorb(*begin); if (begin == _writes.begin()) break; --begin; } // look backwards
+ if (!begin->overlaps(wi)) ++begin; // make inclusive
+
+ DEV { // ensure we're not deleting anything we shouldn't
+ for (iterator it(begin); it != end; ++it) {
+ assert(wi.contains(*it));
+ }
+ }
+
+ _writes.erase(begin, end);
+ _writes.insert(wi);
+
+ DEV { // ensure there are no overlaps
+ // this can be very slow - n^2 - so make it RARELY
+ RARELY {
+ for (iterator it(_writes.begin()), end(boost::prior(_writes.end())); it != end; ++it) {
+ assert(!it->overlaps(*boost::next(it)));
+ }
+ }
+ }
+ }
+ else { // no entries overlapping wi
+ _writes.insert(closest, wi);
+ }
+ }
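+
+ /* worked example of the merge above (hypothetical addresses): with existing
+ intents covering [0x1000,0x1010) and [0x1020,0x1030), inserting an intent
+ for [0x1008,0x1024) overlaps both; the forward/backward loops absorb() both
+ neighbors into wi, the erase(begin, end) removes the two originals, and a
+ single merged intent covering [0x1000,0x1030) is inserted in their place. */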
+
+ /** note an operation other than a "basic write" */
+ void CommitJob::noteOp(shared_ptr<DurOp> p) {
+ d.dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ cc()._hasWrittenThisPass = true;
+ if( !_hasWritten ) {
+ assert( !d.dbMutex._remapPrivateViewRequested );
+ _hasWritten = true;
+ }
+ _wi._ops.push_back(p);
+ }
+
+ size_t privateMapBytes = 0; // used by _REMAPPRIVATEVIEW to track how much / how fast to remap
+
+ void CommitJob::beginCommit() {
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ _commitNumber = _notify.now();
+ stats.curr->_commits++;
+ }
+
+ void CommitJob::reset() {
+ _hasWritten = false;
+ _wi.clear();
+ privateMapBytes += _bytes;
+ _bytes = 0;
+ _nSinceCommitIfNeededCall = 0;
+ }
+
+ CommitJob::CommitJob() : _ab(4 * 1024 * 1024) , _hasWritten(false),
+ _bytes(0), _nSinceCommitIfNeededCall(0) {
+ _commitNumber = 0;
+ }
+
+ extern unsigned notesThisLock;
+
+ void CommitJob::note(void* p, int len) {
+ // from the point of view of the dur module, it would be fine (i think) to only
+ // be read locked here, but we must be at least read locked to avoid a race with
+ // remapprivateview
+ DEV notesThisLock++;
+ DEV d.dbMutex.assertWriteLocked();
+ dassert( cmdLine.dur );
+ cc()._hasWrittenThisPass = true;
+ if( !_wi._alreadyNoted.checkAndSet(p, len) ) {
+ MemoryMappedFile::makeWritable(p, len);
+
+ if( !_hasWritten ) {
+ // you can't be writing if one of these is pending, so this is a verification.
+ assert( !d.dbMutex._remapPrivateViewRequested ); // safe to assert here since it must be the first write in a write lock
+
+ // we don't bother doing a group commit when nothing is written, so we have a var to track that
+ _hasWritten = true;
+ }
+
+ /** tips for debugging:
+ if you have an incorrect diff between data files in different folders
+ (see jstests/dur/quick.js for example),
+ turn this on and see what is logged. if you have a copy of its output from before the
+ regression, a simple diff of these lines would likely tell you a lot.
+ */
+#if 0 && defined(_DEBUG)
+ {
+ static int n;
+ if( ++n < 10000 ) {
+ size_t ofs;
+ MongoMMF *mmf = privateViews._find(w.p, ofs);
+ if( mmf ) {
+ log() << "DEBUG note write intent " << w.p << ' ' << mmf->filename() << " ofs:" << hex << ofs << " len:" << w.len << endl;
+ }
+ else {
+ log() << "DEBUG note write intent " << w.p << ' ' << w.len << " NOT FOUND IN privateViews" << endl;
+ }
+ }
+ else if( n == 10000 ) {
+ log() << "DEBUG stopping write intent logging, too much to log" << endl;
+ }
+ }
+#endif
+
+ // remember intent. we will journal it in a bit
+ _wi.insertWriteIntent(p, len);
+ wassert( _wi._writes.size() < 2000000 );
+ //assert( _wi._writes.size() < 20000000 );
+
+ {
+ // a bit over-conservative in counting page bytes used
+ static size_t lastPos; // note this doesn't reset with each commit, but that is ok; we aren't being that precise
+ size_t x = ((size_t) p) & ~0xfff; // round off to page address (4KB)
+ if( x != lastPos ) {
+ lastPos = x;
+ unsigned b = (len+4095) & ~0xfff;
+ _bytes += b;
+#if defined(_DEBUG)
+ _nSinceCommitIfNeededCall++;
+ if( _nSinceCommitIfNeededCall >= 80 ) {
+ if( _nSinceCommitIfNeededCall % 40 == 0 ) {
+ log() << "debug nsincecommitifneeded:" << _nSinceCommitIfNeededCall << " bytes:" << _bytes << endl;
+ if( _nSinceCommitIfNeededCall == 120 || _nSinceCommitIfNeededCall == 1200 ) {
+ log() << "_DEBUG printing stack given high nsinccommitifneeded number" << endl;
+ printStackTrace();
+ }
+ }
+ }
+#endif
+ if (_bytes > UncommittedBytesLimit * 3) {
+ static time_t lastComplain;
+ static unsigned nComplains;
+ // throttle logging
+ if( ++nComplains < 100 || time(0) - lastComplain >= 60 ) {
+ lastComplain = time(0);
+ warning() << "DR102 too much data written uncommitted " << _bytes/1000000.0 << "MB" << endl;
+ if( nComplains < 10 || nComplains % 10 == 0 ) {
+ // wassert makes getLastError show an error, so we just print stack trace
+ printStackTrace();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
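+
+ /* page accounting sketch (illustrative numbers): for p = (void*)0x12345 and
+ len = 100, the block above computes x = 0x12000 (the 4KB page containing p)
+ and, when the page differs from lastPos, charges b = (100 + 4095) & ~0xfff
+ = 4096 bytes to _bytes -- intents are counted in whole pages, deliberately
+ over-counting so early commits trigger sooner rather than later. */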
+
+ }
+}
diff --git a/src/mongo/db/dur_commitjob.h b/src/mongo/db/dur_commitjob.h
new file mode 100644
index 00000000000..bfc5e3c268f
--- /dev/null
+++ b/src/mongo/db/dur_commitjob.h
@@ -0,0 +1,220 @@
+/* @file dur_commitjob.h used by dur.cpp
+*/
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/concurrency/synchronization.h"
+#include "cmdline.h"
+#include "durop.h"
+#include "dur.h"
+#include "taskqueue.h"
+
+//#define DEBUG_WRITE_INTENT 1
+
+namespace mongo {
+ namespace dur {
+
+ /** declaration of an intent to write to a region of a memory mapped view
+ *
+ * We store the end rather than the start pointer to make operator< faster
+ * since that is heavily used in set lookup.
+ */
+ struct WriteIntent { /* copyable */
+ WriteIntent() : /*w_ptr(0), */ p(0) { }
+ WriteIntent(void *a, unsigned b) : /*w_ptr(0), */ p((char*)a+b), len(b) { }
+
+ void* start() const { return (char*)p - len; }
+ void* end() const { return p; }
+ unsigned length() const { return len; }
+
+ bool operator < (const WriteIntent& rhs) const { return end() < rhs.end(); }
+
+ // can they be merged?
+ bool overlaps(const WriteIntent& rhs) const {
+ return (start() <= rhs.end() && end() >= rhs.start());
+ }
+
+ // is rhs already fully covered? (if so, no merge is needed)
+ bool contains(const WriteIntent& rhs) const {
+ return (start() <= rhs.start() && end() >= rhs.end());
+ }
+
+ // merge into me
+ void absorb(const WriteIntent& other);
+
+ friend ostream& operator << (ostream& out, const WriteIntent& wi) {
+ return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len);
+ }
+
+ //mutable void *w_ptr; // writable mapping of p.
+ // mutable because set::iterator is const but this isn't used in op<
+#if defined(_EXPERIMENTAL)
+ mutable unsigned ofsInJournalBuffer;
+#endif
+ private:
+ void *p; // intent to write up to p
+ unsigned len; // up to this len
+ };
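+
+ /* end-pointer semantics in miniature (illustrative):
+
+ char buf[0x40];
+ WriteIntent a(buf, 0x10); // covers [buf, buf+0x10)
+ WriteIntent b(buf + 0x20, 0x10); // covers [buf+0x20, buf+0x30)
+ WriteIntent w(buf + 0x08, 0x1c); // covers [buf+0x08, buf+0x24)
+
+ here w.overlaps(a) and w.overlaps(b) are both true, and absorbing both
+ yields one intent covering [buf, buf+0x30); operator< compares end()
+ pointers, which is what makes the set<WriteIntent> lower_bound probe
+ in _insertWriteIntent() cheap. */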
+
+ /** try to remember things we have already marked for journaling. false negatives are ok if infrequent -
+ we will just log them twice.
+ */
+ template<int Prime>
+ class Already : boost::noncopyable {
+ public:
+ Already() { clear(); }
+ void clear() { memset(this, 0, sizeof(*this)); }
+
+ /* see if we have Already recorded/indicated our write intent for this region of memory.
+ automatically upgrades the length if the length was shorter previously.
+ @return true if already indicated.
+ */
+ bool checkAndSet(void* p, int len) {
+ unsigned x = mongoutils::hashPointer(p);
+ pair<void*, int>& nd = nodes[x % N];
+ if( nd.first == p ) {
+ if( nd.second < len ) {
+ nd.second = len;
+ return false; // haven't indicated this len yet
+ }
+ return true; // already indicated
+ }
+ nd.first = p;
+ nd.second = len;
+ return false; // a new set
+ }
+
+ private:
+ enum { N = Prime }; // this should be small; the idea is that it fits in the cpu cache easily
+ pair<void*,int> nodes[N];
+ };
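+
+ /* behavior sketch (illustrative):
+
+ Already<127> a;
+ char buf[8];
+ a.checkAndSet(buf, 4); // false: first note of this region
+ a.checkAndSet(buf, 4); // true: already indicated at this length
+ a.checkAndSet(buf, 8); // false: longer than before, note it again
+
+ a hash collision simply overwrites a slot, yielding a false negative;
+ per the class comment that only costs a duplicate journal entry. */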
+
+ /** our record of pending/uncommitted write intents */
+ class Writes : boost::noncopyable {
+ struct D {
+ void *p;
+ unsigned len;
+ static void go(const D& d);
+ };
+ public:
+ TaskQueue<D> _deferred;
+ Already<127> _alreadyNoted;
+ set<WriteIntent> _writes;
+ vector< shared_ptr<DurOp> > _ops; // all the ops other than basic writes
+ bool _drained; // _deferred is drained? for asserting/testing
+
+ /** reset the Writes structure (empties all the above) */
+ void clear();
+
+ /** merges into set (ie non-deferred version) */
+ void _insertWriteIntent(void* p, int len);
+
+ void insertWriteIntent(void* p, int len) {
+#if defined(DEBUG_WRITE_INTENT)
+ if( _debug[p] < len )
+ _debug[p] = len;
+#endif
+ D d;
+ d.p = p;
+ d.len = len;
+ _deferred.defer(d);
+ }
+
+#ifdef _DEBUG
+ WriteIntent _last;
+#endif
+#if defined(DEBUG_WRITE_INTENT)
+ map<void*,int> _debug;
+#endif
+ };
+
+#if defined(DEBUG_WRITE_INTENT)
+ void assertAlreadyDeclared(void *, int len);
+#else
+ inline void assertAlreadyDeclared(void *, int len) { }
+#endif
+
+ /** A commit job object for a group commit. Currently there is one instance of this object.
+
+ concurrency: assumption is caller is appropriately locking.
+ for example note() invocations are from the write lock.
+ other uses are in a read lock from a single thread (durThread)
+ */
+ class CommitJob : boost::noncopyable {
+ public:
+ AlignedBuilder _ab; // for direct i/o writes to journal
+
+ CommitJob();
+
+ ~CommitJob(){ assert(!"shouldn't destroy CommitJob!"); }
+
+ /** record/note an intent to write */
+ void note(void* p, int len);
+
+ /** note an operation other than a "basic write" */
+ void noteOp(shared_ptr<DurOp> p);
+
+ set<WriteIntent>& writes() {
+ if( !_wi._drained ) {
+ // generally, you don't want to use the set until it is prepared (after deferred ops are applied)
+ // thus this assert here.
+ assert(false);
+ }
+ return _wi._writes;
+ }
+
+ vector< shared_ptr<DurOp> >& ops() { return _wi._ops; }
+
+ /** this method is safe to call outside of locks. when hasWritten() is false we don't do any group commit and avoid even
+ trying to acquire a lock, which might be helpful at times.
+ */
+ bool hasWritten() const { return _hasWritten; }
+
+ /** we use the commitjob object over and over, calling reset() rather than reconstructing */
+ void reset();
+
+ void beginCommit();
+
+ /** the commit code calls this when data reaches the journal (on disk) */
+ void notifyCommitted() { _notify.notifyAll(_commitNumber); }
+
+ /** we check how much has been written, and if it is getting to be a lot, we commit sooner. */
+ size_t bytes() const { return _bytes; }
+
+#if defined(_DEBUG)
+ const WriteIntent& lastWrite() const { return _wi._last; }
+#endif
+
+ Writes& wi() { return _wi; }
+ private:
+ NotifyAll::When _commitNumber;
+ bool _hasWritten;
+ Writes _wi; // todo: fix name
+ size_t _bytes;
+ public:
+ NotifyAll _notify; // for getlasterror fsync:true acknowledgements
+ unsigned _nSinceCommitIfNeededCall;
+ };
+
+ extern CommitJob& commitJob;
+
+ }
+}
diff --git a/src/mongo/db/dur_journal.cpp b/src/mongo/db/dur_journal.cpp
new file mode 100644
index 00000000000..6a6609f55ee
--- /dev/null
+++ b/src/mongo/db/dur_journal.cpp
@@ -0,0 +1,748 @@
+// @file dur_journal.cpp writing to the writeahead logging journal
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "client.h"
+#include "namespace.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "dur_stats.h"
+#include "../util/logfile.h"
+#include "../util/timer.h"
+#include "../util/alignedbuilder.h"
+#include "../util/net/listen.h" // getelapsedtimemillis
+#include <boost/static_assert.hpp>
+#include <boost/filesystem.hpp>
+#undef assert
+#define assert MONGO_assert
+#include "../util/mongoutils/str.h"
+#include "dur_journalimpl.h"
+#include "../util/file.h"
+#include "../util/checksum.h"
+#include "../util/concurrency/race.h"
+#include "../util/compress.h"
+#include "../server.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ unsigned goodRandomNumberSlow();
+
+ namespace dur {
+ // Rotate after reaching this data size in a journal (j._<n>) file
+ // We use a smaller size for 32 bit as the journal is mmapped during recovery (only)
+ // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must
+ // work. (and should as-is)
+ // --smallfiles makes the limit small.
+
+#if defined(_DEBUG)
+ unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024;
+#elif defined(__APPLE__)
+ // assuming a developer box if OS X
+ unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024;
+#else
+ unsigned long long DataLimitPerJournalFile = (sizeof(void*)==4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024;
+#endif
+
+ BOOST_STATIC_ASSERT( sizeof(Checksum) == 16 );
+ BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 );
+ BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 );
+ BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 );
+ BOOST_STATIC_ASSERT( sizeof(JEntry) == 12 );
+ BOOST_STATIC_ASSERT( sizeof(LSNFile) == 88 );
+
+ bool usingPreallocate = false;
+
+ void removeOldJournalFile(path p);
+
+ boost::filesystem::path getJournalDir() {
+ boost::filesystem::path p(dbpath);
+ p /= "journal";
+ return p;
+ }
+
+ path lsnPath() {
+ return getJournalDir()/"lsn";
+ }
+
+ /** this should be called when something really bad happens so that we can flag appropriately
+ */
+ void journalingFailure(const char *msg) {
+ /** todo:
+ (1) don't log too much
+ (2) make an indicator in the journal dir that something bad happened.
+ (2b) refuse to do a recovery startup if that is there without manual override.
+ */
+ log() << "journaling failure/error: " << msg << endl;
+ assert(false);
+ }
+
+ JSectFooter::JSectFooter() {
+ memset(this, 0, sizeof(*this));
+ sentinel = JEntry::OpCode_Footer;
+ }
+
+ JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash
+ sentinel = JEntry::OpCode_Footer;
+ reserved = 0;
+ magic[0] = magic[1] = magic[2] = magic[3] = '\n';
+
+ Checksum c;
+ c.gen(begin, (unsigned) len);
+ memcpy(hash, c.bytes, sizeof(hash));
+ }
+
+ bool JSectFooter::checkHash(const void* begin, int len) const {
+ if( !magicOk() ) {
+ log() << "journal footer not valid" << endl;
+ return false;
+ }
+ Checksum c;
+ c.gen(begin, len);
+ DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl;
+ if( memcmp(hash, c.bytes, sizeof(hash)) == 0 )
+ return true;
+ log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16) << " expected: " << toHex(hash,16) << endl;
+ return false;
+ }
+
+ JHeader::JHeader(string fname) {
+ magic[0] = 'j'; magic[1] = '\n';
+ _version = CurrentVersion;
+ memset(ts, 0, sizeof(ts));
+ time_t t = time(0);
+ strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts)-1);
+ memset(dbpath, 0, sizeof(dbpath));
+ strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1);
+ {
+ fileId = t&0xffffffff;
+ fileId |= ((unsigned long long)goodRandomNumberSlow()) << 32;
+ }
+ memset(reserved3, 0, sizeof(reserved3));
+ txt2[0] = txt2[1] = '\n';
+ n1 = n2 = n3 = n4 = '\n';
+ }
+
+ Journal j;
+
+ const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0);
+
+ Journal::Journal() :
+ _curLogFileMutex("JournalLfMutex") {
+ _ageOut = true;
+ _written = 0;
+ _nextFileNumber = 0;
+ _curLogFile = 0;
+ _curFileId = 0;
+ _preFlushTime = 0;
+ _lastFlushTime = 0;
+ _writeToLSNNeeded = false;
+ }
+
+ path Journal::getFilePathFor(int filenumber) const {
+ boost::filesystem::path p(dir);
+ p /= string(str::stream() << "j._" << filenumber);
+ return p;
+ }
+
+ /** never throws
+ @return true if journal dir is not empty
+ */
+ bool haveJournalFiles() {
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") )
+ return true;
+ }
+ }
+ catch(...) { }
+ return false;
+ }
+
+ /** throws */
+ void removeJournalFiles() {
+ log() << "removeJournalFiles" << endl;
+ try {
+ for ( boost::filesystem::directory_iterator i( getJournalDir() );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ try {
+ removeOldJournalFile(*i);
+ }
+ catch(std::exception& e) {
+ log() << "couldn't remove " << fileName << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+ try {
+ boost::filesystem::remove(lsnPath());
+ }
+ catch(...) {
+ log() << "couldn't remove " << lsnPath().string() << endl;
+ throw;
+ }
+ }
+ catch( std::exception& e ) {
+ log() << "error removing journal files " << e.what() << endl;
+ throw;
+ }
+ assert(!haveJournalFiles());
+
+ flushMyDirectory(getJournalDir() / "file"); // flushes parent of argument (in this case journal dir)
+
+ log(1) << "removeJournalFiles end" << endl;
+ }
+
+ /** at clean shutdown */
+ bool okToCleanUp = false; // successful recovery would set this to true
+ void Journal::cleanup(bool _log) {
+ if( !okToCleanUp )
+ return;
+
+ if( _log )
+ log() << "journalCleanup..." << endl;
+ try {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ closeCurrentJournalFile();
+ removeJournalFiles();
+ }
+ catch(std::exception& e) {
+ log() << "error couldn't remove journal file during shutdown " << e.what() << endl;
+ throw;
+ }
+ }
+ void journalCleanup(bool log) { j.cleanup(log); }
+
+ bool _preallocateIsFaster() {
+ bool faster = false;
+ boost::filesystem::path p = getJournalDir() / "tempLatencyTest";
+ try { remove(p); } catch(...) { }
+ try {
+ AlignedBuilder b(8192);
+ int millis[2];
+ const int N = 50;
+ for( int pass = 0; pass < 2; pass++ ) {
+ LogFile f(p.string());
+ Timer t;
+ for( int i = 0 ; i < N; i++ ) {
+ f.synchronousAppend(b.buf(), 8192);
+ }
+ millis[pass] = t.millis();
+ // second time through, file exists and is prealloc case
+ }
+ int diff = millis[0] - millis[1];
+ if( diff > 2 * N ) {
+ // at least 2ms faster for prealloc case?
+ faster = true;
+ log() << "preallocateIsFaster=true " << diff / (1.0*N) << endl;
+ }
+ }
+ catch(...) {
+ log() << "info preallocateIsFaster couldn't run; returning false" << endl;
+ }
+ try { remove(p); } catch(...) { }
+ return faster;
+ }
+ bool preallocateIsFaster() {
+ Timer t;
+ bool res = false;
+ if( _preallocateIsFaster() && _preallocateIsFaster() ) {
+ // maybe the system is just super busy at the moment? sleep a second to let it calm down.
+ // deciding to prealloc is a medium-big decision:
+ sleepsecs(1);
+ res = _preallocateIsFaster();
+ }
+ if( t.millis() > 3000 )
+ log() << "preallocateIsFaster check took " << t.millis()/1000.0 << " secs" << endl;
+ return res;
+ }
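+
+ /* decision arithmetic (illustrative): each pass does N = 50 synchronous 8KB
+ appends, and prealloc is judged faster only when diff > 2*N, i.e. the
+ preallocated pass beats the growing-file pass by more than 100ms total,
+ or >2ms per append on average -- and the test must win twice, plus once
+ more after a one second pause, before prealloc is actually enabled. */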
+
+ // throws
+ void preallocateFile(boost::filesystem::path p, unsigned long long len) {
+ if( exists(p) )
+ return;
+
+ log() << "preallocating a journal file " << p.string() << endl;
+
+ const unsigned BLKSZ = 1024 * 1024;
+ assert( len % BLKSZ == 0 );
+
+ AlignedBuilder b(BLKSZ);
+ memset((void*)b.buf(), 0, BLKSZ);
+
+ ProgressMeter m(len, 3/*secs*/, 10/*hits between time check (once every 6.4MB)*/);
+
+ File f;
+ f.open( p.string().c_str() , /*read-only*/false , /*direct-io*/false );
+ assert( f.is_open() );
+ fileofs loc = 0;
+ while ( loc < len ) {
+ f.write( loc , b.buf() , BLKSZ );
+ loc += BLKSZ;
+ m.hit(BLKSZ);
+ }
+ assert( loc == len );
+ f.fsync();
+ }
+
+ const int NUM_PREALLOC_FILES = 3;
+ inline boost::filesystem::path preallocPath(int n) {
+ assert(n >= 0);
+ assert(n < NUM_PREALLOC_FILES);
+ string fn = str::stream() << "prealloc." << n;
+ return getJournalDir() / fn;
+ }
+
+ // throws
+ void _preallocateFiles() {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+
+ unsigned long long limit = DataLimitPerJournalFile;
+ if( debug && i == 1 ) {
+ // moving 32->64, the prealloc files would be short. that is "ok", but we want to exercise that
+ // case, so when _DEBUG is set we force it here by arbitrarily stopping prealloc at a low
+ // limit for one file. we also want to be able to change the constant in the future without a
+ // lot of work anyway.
+ limit = 16 * 1024 * 1024;
+ }
+ preallocateFile(filepath, limit);
+ }
+ }
+
+ void checkFreeSpace() {
+ unsigned long long spaceNeeded = static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1); // add 10% for headroom
+ unsigned long long freeSpace = File::freeSpace(getJournalDir().string());
+ unsigned long long prealloced = 0;
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if (exists(filepath))
+ prealloced += file_size(filepath);
+ }
+
+ if (freeSpace + prealloced < spaceNeeded) {
+ log() << endl;
+ error() << "Insufficient free space for journals." << endl;
+ log() << "Please make at least " << spaceNeeded/(1024*1024) << "MB available in " << getJournalDir().string() << endl;
+ log() << endl;
+ throw UserException(15926, "Insufficient free space for journals");
+ }
+ }
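+
+ /* e.g. on a 64-bit non-debug build DataLimitPerJournalFile is 1GB, so
+ spaceNeeded is 3 * 1GB * 1.1, roughly 3.3GB; any existing prealloc.<n>
+ files count toward that total, since their space is already reserved. */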
+
+ void preallocateFiles() {
+ if (! (cmdLine.durOptions & CmdLine::DurNoCheckSpace))
+ checkFreeSpace();
+
+ if( exists(preallocPath(0)) || // if enabled previously, keep using
+ exists(preallocPath(1)) ||
+ ( cmdLine.preallocj && preallocateIsFaster() ) ) {
+ usingPreallocate = true;
+ try {
+ _preallocateFiles();
+ }
+ catch(...) {
+ log() << "warning caught exception in preallocateFiles, continuing" << endl;
+ }
+ }
+ j.open();
+ }
+
+ void removeOldJournalFile(path p) {
+ if( usingPreallocate ) {
+ try {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if( !boost::filesystem::exists(filepath) ) {
+ // we can recycle this file into this prealloc file location
+ boost::filesystem::path temppath = filepath.string() + ".temp";
+ boost::filesystem::rename(p, temppath);
+ {
+ // zero the header
+ File f;
+ f.open(temppath.string().c_str(), false, false);
+ char buf[8192];
+ memset(buf, 0, 8192);
+ f.write(0, buf, 8192);
+ f.truncate(DataLimitPerJournalFile);
+ f.fsync();
+ }
+ boost::filesystem::rename(temppath, filepath);
+ return;
+ }
+ }
+ } catch(...) {
+ log() << "warning exception in dur::removeOldJournalFile " << p.string() << endl;
+ // fall through and try to delete the file
+ }
+ }
+
+ // already have 3 prealloc files, so delete this file
+ try {
+ boost::filesystem::remove(p);
+ }
+ catch(...) {
+ log() << "warning exception removing " << p.string() << endl;
+ }
+ }
+
+ // find a prealloc.<n> file, presumably to take and use
+ path findPrealloced() {
+ try {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if( boost::filesystem::exists(filepath) )
+ return filepath;
+ }
+ } catch(...) {
+ log() << "warning exception in dur::findPrealloced()" << endl;
+ }
+ return path();
+ }
+
+ /** assure journal/ dir exists. throws. call during startup. */
+ void journalMakeDir() {
+ j.init();
+
+ boost::filesystem::path p = getJournalDir();
+ j.dir = p.string();
+ log() << "journal dir=" << j.dir << endl;
+ if( !exists(j.dir) ) {
+ try {
+ create_directory(j.dir);
+ }
+ catch(std::exception& e) {
+ log() << "error creating directory " << j.dir << ' ' << e.what() << endl;
+ throw;
+ }
+ }
+ }
+
+ void Journal::_open() {
+ _curFileId = 0;
+ assert( _curLogFile == 0 );
+ path fname = getFilePathFor(_nextFileNumber);
+
+ // if we have a prealloced file, use it
+ {
+ path p = findPrealloced();
+ if( !p.empty() ) {
+ try {
+ {
+ // JHeader::fileId must be updated before renaming to be race-safe
+ LogFile f(p.string());
+ JHeader h(p.string());
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ f.synchronousAppend(b.buf(), b.len());
+ }
+ boost::filesystem::rename(p, fname);
+ }
+ catch(...) {
+ log() << "warning couldn't write to / rename file " << p.string() << endl;
+ }
+ }
+ }
+
+ _curLogFile = new LogFile(fname.string());
+ _nextFileNumber++;
+ {
+ JHeader h(fname.string());
+ _curFileId = h.fileId;
+ assert(_curFileId);
+ AlignedBuilder b(8192);
+ b.appendStruct(h);
+ _curLogFile->synchronousAppend(b.buf(), b.len());
+ }
+ }
+
+ void Journal::init() {
+ assert( _curLogFile == 0 );
+ MongoFile::notifyPreFlush = preFlush;
+ MongoFile::notifyPostFlush = postFlush;
+ }
+
+ void Journal::open() {
+ assert( MongoFile::notifyPreFlush == preFlush );
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ _open();
+ }
+
+ void LSNFile::set(unsigned long long x) {
+ memset(this, 0, sizeof(*this));
+ lsn = x;
+ checkbytes = ~x;
+ }
+
+ /** logs details of the situation and returns 0 if anything surprising is found in the LSNFile;
+ if something highly surprising is found, throws to abort
+ */
+ unsigned long long LSNFile::get() {
+ uassert(13614, str::stream() << "unexpected version number of lsn file in journal/ directory got: " << ver , ver == 0);
+ if( ~lsn != checkbytes ) {
+ log() << "lsnfile not valid. recovery will be from log start. lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl;
+ return 0;
+ }
+ return lsn;
+ }
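+
+ /* torn-write detection sketch (illustrative):
+
+ LSNFile f;
+ f.set(0x1122334455667788ULL); // stores lsn and checkbytes = ~lsn
+ assert( f.get() == 0x1122334455667788ULL );
+ f.lsn ^= 1; // simulate a corrupted/partial write
+ assert( f.get() == 0 ); // mismatch -> recover from log start
+ */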
+
+ /** called during recovery (the error message text below assumes that)
+ */
+ unsigned long long journalReadLSN() {
+ if( !MemoryMappedFile::exists(lsnPath()) ) {
+ log() << "info no lsn file in journal/ directory" << endl;
+ return 0;
+ }
+
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file when writing, that seems unlikely.
+ LSNFile L;
+ File f;
+ f.open(lsnPath().string().c_str());
+ assert(f.is_open());
+ if( f.len() == 0 ) {
+ // this could be 'normal' if we crashed at the right moment
+ log() << "info lsn file is zero bytes long" << endl;
+ return 0;
+ }
+ f.read(0,(char*)&L, sizeof(L));
+ unsigned long long lsn = L.get();
+ return lsn;
+ }
+ catch(std::exception& e) {
+ uasserted(13611, str::stream() << "can't read lsn file in journal directory : " << e.what());
+ }
+ return 0;
+ }
+
+ unsigned long long getLastDataFileFlushTime() {
+ return j.lastFlushTime();
+ }
+
+ /** remember "last sequence number" to speed recoveries
+ concurrency: called by durThread only.
+ */
+ void Journal::updateLSNFile() {
+ RACECHECK
+ if( !_writeToLSNNeeded )
+ return;
+ _writeToLSNNeeded = false;
+ try {
+ // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery.
+ // however, given we actually close the file, that seems unlikely.
+ File f;
+ f.open(lsnPath().string().c_str());
+ if( !f.is_open() ) {
+ // can get 0 if an i/o error
+ log() << "warning: open of lsn file failed" << endl;
+ return;
+ }
+ LOG(1) << "lsn set " << _lastFlushTime << endl;
+ LSNFile lsnf;
+ lsnf.set(_lastFlushTime);
+ f.write(0, (char*)&lsnf, sizeof(lsnf));
+ // do we want to fsync here? if we do it probably needs to be async so the durthread
+ // is not delayed.
+ }
+ catch(std::exception& e) {
+ log() << "warning: write to lsn file failed " << e.what() << endl;
+ // keep running (ignore the error). recovery will be slow.
+ }
+ }
+
+ void Journal::preFlush() {
+ j._preFlushTime = Listener::getElapsedTimeMillis();
+ }
+
+ void Journal::postFlush() {
+ j._lastFlushTime = j._preFlushTime;
+ j._writeToLSNNeeded = true;
+ }
+
+ // call from within _curLogFileMutex
+ void Journal::closeCurrentJournalFile() {
+ if (!_curLogFile)
+ return;
+
+ JFile jf;
+ jf.filename = _curLogFile->_name;
+ jf.lastEventTimeMs = Listener::getElapsedTimeMillis();
+ _oldJournalFiles.push_back(jf);
+
+ delete _curLogFile; // close
+ _curLogFile = 0;
+ _written = 0;
+ }
+
+ /** remove older journal files.
+ hold _curLogFileMutex, but not dbMutex, when calling
+ */
+ void Journal::removeUnneededJournalFiles() {
+ while( !_oldJournalFiles.empty() ) {
+ JFile f = _oldJournalFiles.front();
+
+ if( f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs ) {
+ // eligible for deletion
+ path p( f.filename );
+ log() << "old journal file will be removed: " << f.filename << endl;
+ removeOldJournalFile(p);
+ }
+ else {
+ break;
+ }
+
+ _oldJournalFiles.pop_front();
+ }
+ }
+
+ /*int getAgeOutJournalFiles() {
+ mutex::try_lock lk(j._curLogFileMutex, 4000);
+ if( !lk.ok )
+ return -1;
+ return j._ageOut ? 1 : 0;
+ }*/
+ void setAgeOutJournalFiles(bool a) {
+ SimpleMutex::scoped_lock lk(j._curLogFileMutex);
+ j._ageOut = a;
+ }
+
+ void Journal::_rotate() {
+ if( d.dbMutex.atLeastReadLocked() ) {
+ LOGSOME << "info journal _rotate called inside dbMutex - ok but should be somewhat rare" << endl;
+ }
+
+ RACECHECK;
+
+ _curLogFileMutex.dassertLocked();
+
+ if ( inShutdown() || !_curLogFile )
+ return;
+
+ j.updateLSNFile();
+
+ if( _curLogFile && _written < DataLimitPerJournalFile )
+ return;
+
+ if( _curLogFile ) {
+ _curLogFile->truncate();
+ closeCurrentJournalFile();
+ removeUnneededJournalFiles();
+ }
+
+ try {
+ Timer t;
+ _open();
+ int ms = t.millis();
+ if( ms >= 200 ) {
+ log() << "DR101 latency warning on journal file open " << ms << "ms" << endl;
+ }
+ }
+ catch(std::exception& e) {
+ log() << "warning exception opening journal file " << e.what() << endl;
+ throw;
+ }
+ }
+
+ /** write (append) the buffer we have built to the journal and fsync it.
+ outside of dbMutex lock as this could be slow.
+ @param uncompressed - a buffer that will be written to the journal after compression
+ will not return until on disk
+ */
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed) {
+ Timer t;
+ j.journal(h, uncompressed);
+ stats.curr->_writeToJournalMicros += t.micros();
+ }
+ void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ RACECHECK
+ static AlignedBuilder b(32*1024*1024);
+ /* buffer to journal will be
+ JSectHeader
+ compressed operations
+ JSectFooter
+ */
+ const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter);
+ const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;
+ b.reset(max);
+
+ {
+ dassert( h.sectionLen() == (unsigned) 0xffffffff ); // we will backfill later
+ b.appendStruct(h);
+ }
+
+ size_t compressedLength = 0;
+ rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength);
+ assert( compressedLength < 0xffffffff );
+ assert( compressedLength < max );
+ b.skip(compressedLength);
+
+ // footer
+ unsigned L = 0xffffffff;
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned lenUnpadded = b.len() + sizeof(JSectFooter);
+ L = (lenUnpadded + Alignment-1) & (~(Alignment-1));
+ dassert( L >= lenUnpadded );
+
+ ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded);
+
+ JSectFooter f(b.buf(), b.len()); // computes checksum
+ b.appendStruct(f);
+ dassert( b.len() == lenUnpadded );
+
+ b.skip(L - lenUnpadded);
+ dassert( b.len() % Alignment == 0 );
+ }
+
+ try {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+
+ // must already be open -- so that _curFileId is correct for previous buffer building
+ assert( _curLogFile );
+
+ stats.curr->_uncompressedBytes += b.len();
+ unsigned w = b.len();
+ _written += w;
+ assert( w <= L );
+ stats.curr->_journaledBytes += L;
+ _curLogFile->synchronousAppend((const void *) b.buf(), L);
+ _rotate();
+ }
+ catch(std::exception& e) {
+ log() << "error exception in dur::journal " << e.what() << endl;
+ throw;
+ }
+ }
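+
+ /* padding arithmetic example (illustrative numbers): if the header plus
+ compressed payload brings b.len() to 30000 bytes, then lenUnpadded =
+ 30000 + 32 (JSectFooter) = 30032 and L = (30032 + 8191) & ~8191 = 32768,
+ i.e. four 8192-byte Alignment units; the final skip() zero-fills the
+ remaining 2736 bytes so every section ends on an Alignment boundary. */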
+
+ }
+}
+
+/* todo
+ test (and handle) disk full on journal append. best quick thing to do is to terminate.
+ if we roll back operations, there are nuances such as is ReplSetImpl::lastOpTimeWritten too new in ram then?
+*/
diff --git a/src/mongo/db/dur_journal.h b/src/mongo/db/dur_journal.h
new file mode 100644
index 00000000000..664f63942e0
--- /dev/null
+++ b/src/mongo/db/dur_journal.h
@@ -0,0 +1,68 @@
+// @file dur_journal.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** true if ok to cleanup journal files at termination. otherwise, journal files will be retained.
+ */
+ extern bool okToCleanUp;
+
+ /** at termination after db files closed & fsynced
+ also after recovery
+ closes and removes journal files
+ @param log report in log that we are cleaning up if we actually do any work
+ */
+ void journalCleanup(bool log = false);
+
+ /** assure journal/ dir exists. throws */
+ void journalMakeDir();
+
+ /** check if time to rotate files; assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ only called by durThread.
+ */
+ void journalRotate();
+
+ /** flag that something has gone wrong during writing to the journal
+ (not for recovery mode)
+ */
+ void journalingFailure(const char *msg);
+
+ /** read lsn from disk from the last run before doing recovery */
+ unsigned long long journalReadLSN();
+
+ unsigned long long getLastDataFileFlushTime();
+
+ /** never throws.
+ @return true if there are any journal files in the journal dir.
+ */
+ bool haveJournalFiles();
+
+ // in case disk controller buffers writes
+ const long long ExtraKeepTimeMs = 10000;
+
+ const unsigned JournalCommitIntervalDefault = 100;
+
+ }
+}
diff --git a/src/mongo/db/dur_journalformat.h b/src/mongo/db/dur_journalformat.h
new file mode 100644
index 00000000000..10ed8487b71
--- /dev/null
+++ b/src/mongo/db/dur_journalformat.h
@@ -0,0 +1,174 @@
+// @file dur_journalformat.h The format of our journal files.
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ namespace dur {
+
+ const unsigned Alignment = 8192;
+
+#pragma pack(1)
+ /** beginning header for a journal/j._<n> file
+ there is nothing important in this header at this time, except perhaps the version #.
+ */
+ struct JHeader {
+ JHeader() { }
+ JHeader(string fname);
+
+ char magic[2]; // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or something...
+
+ // 0x4142 is ascii-readable if you look at the file with head/less -- thus the starting values were near
+ // that. simply incrementing the version # is safe on a forward basis.
+#if defined(_NOCOMPRESS)
+ enum { CurrentVersion = 0x4148 };
+#else
+ enum { CurrentVersion = 0x4149 };
+#endif
+ unsigned short _version;
+
+ // these are just for diagnostic ease (make header more useful as plain text)
+ char n1; // '\n'
+ char ts[20]; // ascii timestamp of file generation. for user reading, not used by code.
+ char n2; // '\n'
+ char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used by code.
+ char n3, n4; // '\n', '\n'
+
+ unsigned long long fileId; // unique identifier that will be in each JSectHeader. important as we recycle prealloced files
+
+ char reserved3[8026]; // 8KB total for the file header
+ char txt2[2]; // "\n\n" at the end
+
+ bool versionOk() const { return _version == CurrentVersion; }
+ bool valid() const { return magic[0] == 'j' && txt2[1] == '\n' && fileId; }
+ };
+
+ /** "Section" header. A section corresponds to a group commit.
+ len is length of the entire section including header and footer.
+ header and footer are not compressed, just the stuff in between.
+ */
+ struct JSectHeader {
+ private:
+ unsigned _sectionLen; // unpadded length in bytes of the whole section
+ public:
+ unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work
+ unsigned long long fileId; // matches JHeader::fileId
+ unsigned sectionLen() const { return _sectionLen; }
+
+ // we store the unpadded length so we can use that when we uncompress. to
+ // get the true total size this must be rounded up to the Alignment.
+ void setSectionLen(unsigned lenUnpadded) { _sectionLen = lenUnpadded; }
+
+ unsigned sectionLenWithPadding() const {
+ unsigned x = (sectionLen() + (Alignment-1)) & (~(Alignment-1));
+ dassert( x % Alignment == 0 );
+ return x;
+ }
+ };
+
+ /** an individual write operation within a group commit section. Either the entire section should
+ be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.)
+ */
+ struct JEntry {
+ enum OpCodes {
+ OpCode_Footer = 0xffffffff,
+ OpCode_DbContext = 0xfffffffe,
+ OpCode_FileCreated = 0xfffffffd,
+ OpCode_DropDb = 0xfffffffc,
+ OpCode_Min = 0xfffff000
+ };
+ union {
+ unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header
+ OpCodes opcode;
+ };
+
+ unsigned ofs; // offset in file
+
+ // sentinel and masks for _fileNo
+ enum {
+ DotNsSuffix = 0x7fffffff, // ".ns" file
+ LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext
+ };
+ int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database
+ // char data[len] follows
+
+ const char * srcData() const {
+ const int *i = &_fileNo;
+ return (const char *) (i+1);
+ }
+
+ int getFileNo() const { return _fileNo & (~LocalDbBit); }
+ void setFileNo(int f) { _fileNo = f; }
+ bool isNsSuffix() const { return getFileNo() == DotNsSuffix; }
+
+ void setLocalDbContextBit() { _fileNo |= LocalDbBit; }
+ bool isLocalDbContext() const { return _fileNo & LocalDbBit; }
+ void clearLocalDbContextBit() { _fileNo = getFileNo(); }
+
+ static string suffix(int fileno) {
+ if( fileno == DotNsSuffix ) return "ns";
+ stringstream ss;
+ ss << fileno;
+ return ss.str();
+ }
+ };
+
+ /** group commit section footer. md5 is a key field. */
+ struct JSectFooter {
+ JSectFooter();
+ JSectFooter(const void* begin, int len); // needs buffer to compute hash
+ unsigned sentinel;
+ unsigned char hash[16];
+ unsigned long long reserved;
+ char magic[4]; // "\n\n\n\n"
+
+ /** used by recovery to see if buffer is valid
+ @param begin the buffer
+ @param len buffer len
+ @return true if buffer looks valid
+ */
+ bool checkHash(const void* begin, int len) const;
+
+ bool magicOk() const { return *((unsigned*)magic) == 0x0a0a0a0a; }
+ };
+
+ /** declares "the next entry(s) are for this database / file path prefix" */
+ struct JDbContext {
+ JDbContext() : sentinel(JEntry::OpCode_DbContext) { }
+            const unsigned sentinel; // occupies the JEntry::len slot -- OpCode_DbContext is our sentinel
+ //char dbname[];
+ };
+
+ /** "last sequence number" */
+ struct LSNFile {
+ unsigned ver;
+ unsigned reserved2;
+ unsigned long long lsn;
+ unsigned long long checkbytes;
+ unsigned long long reserved[8];
+
+ void set(unsigned long long lsn);
+ unsigned long long get();
+ };
+
+#pragma pack()
+
+ }
+
+}
diff --git a/src/mongo/db/dur_journalimpl.h b/src/mongo/db/dur_journalimpl.h
new file mode 100644
index 00000000000..8aad70b0e5c
--- /dev/null
+++ b/src/mongo/db/dur_journalimpl.h
@@ -0,0 +1,103 @@
+// @file dur_journalimpl.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/logfile.h"
+
+namespace mongo {
+ namespace dur {
+
+ /** the writeahead journal for durability */
+ class Journal {
+ public:
+ string dir; // set by journalMakeDir() during initialization
+
+ Journal();
+
+ /** call during startup by journalMakeDir() */
+ void init();
+
+ /** check if time to rotate files. assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ thread: durThread()
+ */
+ void rotate();
+
+ /** append to the journal file
+ */
+ void journal(const JSectHeader& h, const AlignedBuilder& b);
+
+ boost::filesystem::path getFilePathFor(int filenumber) const;
+
+ unsigned long long lastFlushTime() const { return _lastFlushTime; }
+ void cleanup(bool log); // closes and removes journal files
+
+ unsigned long long curFileId() const { return _curFileId; }
+
+ void assureLogFileOpen() {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+ if( _curLogFile == 0 )
+ _open();
+ }
+
+ /** open a journal file to journal operations to. */
+ void open();
+
+ private:
+ /** check if time to rotate files. assure a file is open.
+ * internally called with every commit
+ */
+ void _rotate();
+
+ void _open();
+ void closeCurrentJournalFile();
+ void removeUnneededJournalFiles();
+
+ unsigned long long _written; // bytes written so far to the current journal (log) file
+ unsigned _nextFileNumber;
+ public:
+ SimpleMutex _curLogFileMutex;
+ bool _ageOut;
+ private:
+
+ LogFile *_curLogFile; // use _curLogFileMutex
+ unsigned long long _curFileId; // current file id see JHeader::fileId
+
+ struct JFile {
+ string filename;
+ unsigned long long lastEventTimeMs;
+ };
+
+ // files which have been closed but not unlinked (rotated out) yet
+ // ordered oldest to newest
+ list<JFile> _oldJournalFiles; // use _curLogFileMutex
+
+ // lsn related
+ static void preFlush();
+ static void postFlush();
+ unsigned long long _preFlushTime;
+ unsigned long long _lastFlushTime; // data < this time is fsynced in the datafiles (unless hard drive controller is caching)
+ bool _writeToLSNNeeded;
+ void updateLSNFile();
+ };
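+        // Illustrative note (not in the original source): the intended call sequence, pieced
+        // together from the comments above -- a sketch, not normative:
+        //   Journal j;
+        //   j.init();               // called via journalMakeDir() at startup
+        //   j.rotate();             // durThread(): rotate/open outside the group commit lock
+        //   j.journal(h, b);        // append one group-commit section (JSectHeader + buffer)
+        //   j.cleanup(true);        // shutdown: close and remove journal files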
+
+ }
+}
diff --git a/src/mongo/db/dur_preplogbuffer.cpp b/src/mongo/db/dur_preplogbuffer.cpp
new file mode 100644
index 00000000000..10b63c0e549
--- /dev/null
+++ b/src/mongo/db/dur_preplogbuffer.cpp
@@ -0,0 +1,177 @@
+// @file dur_preplogbuffer.cpp
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ PREPLOGBUFFER
+  we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ for very large objects write directly to redo log in situ?
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "dur.h"
+#include "dur_journal.h"
+#include "dur_journalimpl.h"
+#include "dur_commitjob.h"
+#include "../util/mongoutils/hash.h"
+#include "../util/mongoutils/str.h"
+#include "../util/alignedbuilder.h"
+#include "../util/timer.h"
+#include "dur_stats.h"
+#include "../server.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+ namespace dur {
+
+ extern Journal j;
+
+ RelativePath local = RelativePath::fromRelativePath("local");
+
+ static MongoMMF* findMMF_inlock(void *ptr, size_t &ofs) {
+ MongoMMF *f = privateViews.find_inlock(ptr, ofs);
+ if( f == 0 ) {
+ error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl;
+ printStackTrace(); // we want a stack trace and the assert below didn't print a trace once in the real world - not sure why
+ stringstream ss;
+ ss << "view pointer cannot be resolved " << hex << (size_t) ptr;
+ journalingFailure(ss.str().c_str()); // asserts, which then abends
+ }
+ return f;
+ }
+
+ /** put the basic write operation into the buffer (bb) to be journaled */
+ static void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) {
+ size_t ofs = 1;
+ MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs);
+
+ if( unlikely(!mmf->willNeedRemap()) ) {
+                // tag this mmf as needing a remap of its private view later.
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ //
+ // this was for WRITETODATAFILES_Impl2 so commented out now
+ //
+ /*
+ dassert( i->w_ptr == 0 );
+ i->w_ptr = ((char*)mmf->view_write()) + ofs;
+ */
+
+ JEntry e;
+            e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past the end of the file
+ assert( ofs <= 0x80000000 );
+ e.ofs = (unsigned) ofs;
+ e.setFileNo( mmf->fileSuffixNo() );
+ if( mmf->relativePath() == local ) {
+ e.setLocalDbContextBit();
+ }
+ else if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+ bb.appendStruct(e);
+#if defined(_EXPERIMENTAL)
+ i->ofsInJournalBuffer = bb.len();
+#endif
+ bb.appendBuf(i->start(), e.len);
+
+ if (unlikely(e.len != (unsigned)i->length())) {
+ log() << "journal info splitting prepBasicWrite at boundary" << endl;
+
+ // This only happens if we write to the last byte in a file and
+                // the first byte in another file that is mapped adjacently. I
+ // think most OSs leave at least a one page gap between
+ // mappings, but better to be safe.
+
+ WriteIntent next ((char*)i->start() + e.len, i->length() - e.len);
+ prepBasicWrite_inlock(bb, &next, lastDbPath);
+ }
+ }
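+        // Illustrative note (not in the original source): a concrete (hypothetical) instance of
+        // the split above. If <db>.3 is 0x1000 bytes long and an intent covers [0xff0, 0x1010)
+        // of it, the first JEntry is clipped to len 0x10, and the recursive call emits a second
+        // JEntry for the remaining 0x10 bytes at the start of the adjacently mapped file.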
+
+        /** basic write ops / write intents. note there is no particular order to these: if we have
+            two writes to the same location during the group commit interval, it is likely
+            (although not assured) that it is journaled here only once.
+ */
+ static void prepBasicWrites(AlignedBuilder& bb) {
+ scoped_lock lk(privateViews._mutex());
+
+ // each time events switch to a different database we journal a JDbContext
+ RelativePath lastDbPath;
+
+ for( set<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
+ prepBasicWrite_inlock(bb, &(*i), lastDbPath);
+ }
+ }
+
+ static void resetLogBuffer(/*out*/JSectHeader& h, AlignedBuilder& bb) {
+ bb.reset();
+
+ h.setSectionLen(0xffffffff); // total length, will fill in later
+ h.seqNumber = getLastDataFileFlushTime();
+ h.fileId = j.curFileId();
+ }
+
+        /** we will build an output buffer ourselves and then use O_DIRECT
+ we could be in read lock for this
+ caller handles locking
+ @return partially populated sectheader and _ab set
+ */
+ static void _PREPLOGBUFFER(JSectHeader& h) {
+ assert( cmdLine.dur );
+
+ {
+ // now that we are locked, fully drain deferred notes of write intents
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ Writes& writes = commitJob.wi();
+ writes._deferred.invoke();
+ writes._drained = true;
+ }
+
+ AlignedBuilder& bb = commitJob._ab;
+ resetLogBuffer(h, bb); // adds JSectHeader
+
+ // ops other than basic writes (DurOp's)
+ {
+ for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
+ (*i)->serialize(bb);
+ }
+ }
+
+ prepBasicWrites(bb);
+
+ return;
+ }
+ void PREPLOGBUFFER(/*out*/ JSectHeader& h) {
+ Timer t;
+ j.assureLogFileOpen(); // so fileId is set
+ _PREPLOGBUFFER(h);
+ stats.curr->_prepLogBufferMicros += t.micros();
+ }
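+        // Illustrative note (not in the original source): the buffer built above holds one
+        // group-commit section, which per dur_journalformat.h is laid out roughly as:
+        //   JSectHeader                          (len patched later; seqNumber; fileId)
+        //   serialized DurOps                    (file creates, db drops, ...)
+        //   [JDbContext "<db>"] JEntry + bytes   (repeated; context re-emitted on db change)
+        //   JSectFooter                          (md5; appended downstream of this function)
+        // Only the part between header and footer gets compressed.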
+
+ }
+}
diff --git a/src/mongo/db/dur_recover.cpp b/src/mongo/db/dur_recover.cpp
new file mode 100644
index 00000000000..a0a8843572c
--- /dev/null
+++ b/src/mongo/db/dur_recover.cpp
@@ -0,0 +1,544 @@
+// @file dur_recover.cpp crash recovery via the journal
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "dur.h"
+#include "dur_stats.h"
+#include "dur_recover.h"
+#include "dur_journal.h"
+#include "dur_journalformat.h"
+#include "durop.h"
+#include "namespace.h"
+#include "../util/mongoutils/str.h"
+#include "../util/bufreader.h"
+#include "../util/concurrency/race.h"
+#include "pdfile.h"
+#include "database.h"
+#include "db.h"
+#include "../util/unittest.h"
+#include "../util/checksum.h"
+#include "cmdline.h"
+#include "curop.h"
+#include "mongommf.h"
+#include "../util/compress.h"
+
+#include <sys/stat.h>
+#include <fcntl.h>
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ namespace dur {
+
+ struct ParsedJournalEntry { /*copyable*/
+ ParsedJournalEntry() : e(0) { }
+
+ // relative path of database for the operation.
+            // might be a pointer into the mmapped journal file
+ const char *dbName;
+
+            // these are pointers into the memory mapped journal file
+ const JEntry *e; // local db sentinel is already parsed out here into dbName
+
+ // if not one of the two simple JEntry's above, this is the operation:
+ shared_ptr<DurOp> op;
+ };
+
+ void removeJournalFiles();
+ path getJournalDir();
+
+ /** get journal filenames, in order. throws if unexpected content found */
+ static void getFiles(path dir, vector<path>& files) {
+ map<unsigned,path> m;
+ for ( boost::filesystem::directory_iterator i( dir );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ boost::filesystem::path filepath = *i;
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ unsigned u = str::toUnsigned( str::after(fileName, '_') );
+ if( m.count(u) ) {
+ uasserted(13531, str::stream() << "unexpected files in journal directory " << dir.string() << " : " << fileName);
+ }
+ m.insert( pair<unsigned,path>(u,filepath) );
+ }
+ }
+ for( map<unsigned,path>::iterator i = m.begin(); i != m.end(); ++i ) {
+ if( i != m.begin() && m.count(i->first - 1) == 0 ) {
+ uasserted(13532,
+ str::stream() << "unexpected file in journal directory " << dir.string()
+                              << " : " << boost::filesystem::path(i->second).leaf() << " : can't find its preceding file");
+ }
+ files.push_back(i->second);
+ }
+ }
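+        // Illustrative note (not in the original source): journal files are named j._<n>, so a
+        // directory holding j._2, j._0, j._1 comes back ordered j._0, j._1, j._2. A gap (j._0,
+        // j._2 with no j._1) trips 13532; two names parsing to the same <n> trip 13531; files
+        // without the j._ prefix are simply ignored.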
+
+ /** read through the memory mapped data of a journal file (journal/j._<n> file)
+ throws
+ */
+ class JournalSectionIterator : boost::noncopyable {
+ auto_ptr<BufReader> _entries;
+ const JSectHeader _h;
+            const char *_lastDbName; // pointer into the mmapped journal file
+ const bool _doDurOps;
+ string _uncompressed;
+ public:
+ JournalSectionIterator(const JSectHeader& h, const void *compressed, unsigned compressedLen, bool doDurOpsRecovering) :
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(doDurOpsRecovering)
+ {
+ assert( doDurOpsRecovering );
+ bool ok = uncompress((const char *)compressed, compressedLen, &_uncompressed);
+ if( !ok ) {
+ // it should always be ok (i think?) as there is a previous check to see that the JSectFooter is ok
+ log() << "couldn't uncompress journal section" << endl;
+ msgasserted(15874, "couldn't uncompress journal section");
+ }
+ const char *p = _uncompressed.c_str();
+ assert( compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader) );
+ _entries = auto_ptr<BufReader>( new BufReader(p, _uncompressed.size()) );
+ }
+
+ // we work with the uncompressed buffer when doing a WRITETODATAFILES (for speed)
+ JournalSectionIterator(const JSectHeader &h, const void *p, unsigned len) :
+ _entries( new BufReader((const char *) p, len) ),
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(false)
+
+ { }
+
+ bool atEof() const { return _entries->atEof(); }
+
+ unsigned long long seqNumber() const { return _h.seqNumber; }
+
+ /** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
+ * throws on premature end of section.
+ */
+ void next(ParsedJournalEntry& e) {
+ unsigned lenOrOpCode;
+ _entries->read(lenOrOpCode);
+
+ if (lenOrOpCode > JEntry::OpCode_Min) {
+ switch( lenOrOpCode ) {
+
+ case JEntry::OpCode_Footer: {
+ assert( false );
+ }
+
+ case JEntry::OpCode_FileCreated:
+ case JEntry::OpCode_DropDb: {
+ e.dbName = 0;
+ boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries);
+ if (_doDurOps) {
+ e.op = op;
+ }
+ return;
+ }
+
+ case JEntry::OpCode_DbContext: {
+ _lastDbName = (const char*) _entries->pos();
+ const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _entries->remaining());
+ const unsigned len = strnlen(_lastDbName, limit);
+ massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0');
+ _entries->skip(len+1); // skip '\0' too
+ _entries->read(lenOrOpCode); // read this for the fall through
+ }
+ // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet
+
+ default:
+ // fall through
+ ;
+ }
+ }
+
+ // JEntry - a basic write
+ assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min );
+ _entries->rewind(4);
+ e.e = (JEntry *) _entries->skip(sizeof(JEntry));
+ e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName;
+ assert( e.e->len == lenOrOpCode );
+ _entries->skip(e.e->len);
+ }
+
+ };
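+        // Illustrative note (not in the original source): next() dispatches on the first u32 of
+        // each record. Values above OpCode_Min are opcodes (footer, db context, DurOps); anything
+        // smaller is the len field of a JEntry, so the reader rewinds 4 bytes and overlays the
+        // JEntry struct directly on the (uncompressed) buffer.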
+
+ static string fileName(const char* dbName, int fileNo) {
+ stringstream ss;
+ ss << dbName << '.';
+ assert( fileNo >= 0 );
+ if( fileNo == JEntry::DotNsSuffix )
+ ss << "ns";
+ else
+ ss << fileNo;
+
+ // relative name -> full path name
+ path full(dbpath);
+ full /= ss.str();
+ return full.string();
+ }
+
+ RecoveryJob::~RecoveryJob() {
+ DESTRUCTOR_GUARD(
+ if( !_mmfs.empty() )
+ close();
+ )
+ }
+
+ void RecoveryJob::close() {
+ scoped_lock lk(_mx);
+ _close();
+ }
+
+ void RecoveryJob::_close() {
+ MongoFile::flushAll(true);
+ _mmfs.clear();
+ }
+
+ void RecoveryJob::write(const ParsedJournalEntry& entry) {
+ //TODO(mathias): look into making some of these dasserts
+ assert(entry.e);
+ assert(entry.dbName);
+ assert(strnlen(entry.dbName, MaxDatabaseNameLen) < MaxDatabaseNameLen);
+
+ const string fn = fileName(entry.dbName, entry.e->getFileNo());
+ MongoFile* file;
+ {
+ MongoFileFinder finder; // must release lock before creating new MongoMMF
+ file = finder.findByPath(fn);
+ }
+
+ MongoMMF* mmf;
+ if (file) {
+ assert(file->isMongoMMF());
+ mmf = (MongoMMF*)file;
+ }
+ else {
+ if( !_recovering ) {
+ log() << "journal error applying writes, file " << fn << " is not open" << endl;
+ assert(false);
+ }
+ boost::shared_ptr<MongoMMF> sp (new MongoMMF);
+ assert(sp->open(fn, false));
+ _mmfs.push_back(sp);
+ mmf = sp.get();
+ }
+
+ if ((entry.e->ofs + entry.e->len) <= mmf->length()) {
+ assert(mmf->view_write());
+ assert(entry.e->srcData());
+
+ void* dest = (char*)mmf->view_write() + entry.e->ofs;
+ memcpy(dest, entry.e->srcData(), entry.e->len);
+ stats.curr->_writeToDataFilesBytes += entry.e->len;
+ }
+ else {
+ massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering);
+ }
+ }
+
+ void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) {
+ if( entry.e ) {
+ if( dump ) {
+ stringstream ss;
+ ss << " BASICWRITE " << setw(20) << entry.dbName << '.';
+ if( entry.e->isNsSuffix() )
+ ss << "ns";
+ else
+ ss << setw(2) << entry.e->getFileNo();
+ ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/
+ " " << hexdump(entry.e->srcData(), entry.e->len);
+ log() << ss.str() << endl;
+ }
+ if( apply ) {
+ write(entry);
+ }
+ }
+ else if(entry.op) {
+ // a DurOp subclass operation
+ if( dump ) {
+ log() << " OP " << entry.op->toString() << endl;
+ }
+ if( apply ) {
+ if( entry.op->needFilesClosed() ) {
+ _close(); // locked in processSection
+ }
+ entry.op->replay();
+ }
+ }
+ }
+
+ void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) {
+ bool apply = (cmdLine.durOptions & CmdLine::DurScanOnly) == 0;
+ bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal;
+ if( dump )
+ log() << "BEGIN section" << endl;
+
+ for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) {
+ applyEntry(*i, apply, dump);
+ }
+
+ if( dump )
+ log() << "END section" << endl;
+ }
+
+ void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) {
+ scoped_lock lk(_mx);
+ RACECHECK
+
+            /** todo: we should really verify the checksum to see that seqNumber is ok,
+                but that is expensive. maybe there could be a checksum of just the header,
+                within the header itself.
+            */
+ if( _recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs ) {
+ if( h->seqNumber != _lastSeqMentionedInConsoleLog ) {
+ static int n;
+ if( ++n < 10 ) {
+ log() << "recover skipping application of section seq:" << h->seqNumber << " < lsn:" << _lastDataSyncedFromLastRun << endl;
+ }
+ else if( n == 10 ) {
+ log() << "recover skipping application of section more..." << endl;
+ }
+ _lastSeqMentionedInConsoleLog = h->seqNumber;
+ }
+ return;
+ }
+
+ auto_ptr<JournalSectionIterator> i;
+ if( _recovering ) {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering));
+ }
+ else {
+ i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len));
+ }
+
+ // we use a static so that we don't have to reallocate every time through. occasionally we
+ // go back to a small allocation so that if there were a spiky growth it won't stick forever.
+ static vector<ParsedJournalEntry> entries;
+ entries.clear();
+/** TEMP uncomment
+ RARELY OCCASIONALLY {
+ if( entries.capacity() > 2048 ) {
+ entries.shrink_to_fit();
+ entries.reserve(2048);
+ }
+ }
+*/
+
+ // first read all entries to make sure this section is valid
+ ParsedJournalEntry e;
+ while( !i->atEof() ) {
+ i->next(e);
+ entries.push_back(e);
+ }
+
+ // after the entries check the footer checksum
+ if( _recovering ) {
+ assert( ((const char *)h) + sizeof(JSectHeader) == p );
+ if( !f->checkHash(h, len + sizeof(JSectHeader)) ) {
+ msgasserted(13594, "journal checksum doesn't match");
+ }
+ }
+
+ // got all the entries for one group commit. apply them:
+ applyEntries(entries);
+ }
+
+ /** apply a specific journal file, that is already mmap'd
+ @param p start of the memory mapped file
+ @return true if this is detected to be the last file (ends abruptly)
+ */
+ bool RecoveryJob::processFileBuffer(const void *p, unsigned len) {
+ try {
+ unsigned long long fileId;
+ BufReader br(p,len);
+
+ {
+ // read file header
+ JHeader h;
+ br.read(h);
+
+ /* [dm] not automatically handled. we should eventually handle this automatically. i think:
+ (1) if this is the final journal file
+ (2) and the file size is just the file header in length (or less) -- this is a bit tricky to determine if prealloced
+ then can just assume recovery ended cleanly and not error out (still should log).
+ */
+ uassert(13537,
+ "journal file header invalid. This could indicate corruption in a journal file, or perhaps a crash where sectors in file header were in flight written out of order at time of crash (unlikely but possible).",
+ h.valid());
+
+ if( !h.versionOk() ) {
+ log() << "journal file version number mismatch got:" << hex << h._version
+ << " expected:" << hex << (unsigned) JHeader::CurrentVersion
+ << ". if you have just upgraded, recover with old version of mongod, terminate cleanly, then upgrade."
+ << endl;
+ uasserted(13536, str::stream() << "journal version number mismatch " << h._version);
+ }
+ fileId = h.fileId;
+ if(cmdLine.durOptions & CmdLine::DurDumpJournal) {
+ log() << "JHeader::fileId=" << fileId << endl;
+ }
+ }
+
+ // read sections
+ while ( !br.atEof() ) {
+ JSectHeader h;
+ br.peek(h);
+ if( h.fileId != fileId ) {
+ if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) {
+ log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl;
+ log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl;
+ }
+ return true;
+ }
+ unsigned slen = h.sectionLen();
+ unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter);
+ const char *hdr = (const char *) br.skip(h.sectionLenWithPadding());
+ const char *data = hdr + sizeof(JSectHeader);
+ const char *footer = data + dataLen;
+ processSection((const JSectHeader*) hdr, data, dataLen, (const JSectFooter*) footer);
+
+ // ctrl c check
+ killCurrentOp.checkForInterrupt(false);
+ }
+ }
+ catch( BufReader::eof& ) {
+ if( cmdLine.durOptions & CmdLine::DurDumpJournal )
+ log() << "ABRUPT END" << endl;
+ return true; // abrupt end
+ }
+
+ return false; // non-abrupt end
+ }
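+        // Illustrative note (not in the original source): the loop above implies the layout of
+        // one journal file:
+        //   JHeader                          (8KB; magic, version, fileId)
+        //   JSectHeader ... JSectFooter      (section 0, padded out to Alignment)
+        //   JSectHeader ... JSectFooter      (section 1)
+        //   ...
+        // A section whose fileId differs from the JHeader's (a recycled prealloc file) or an
+        // abrupt EOF both mark the logical end of the file.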
+
+ /** apply a specific journal file */
+ bool RecoveryJob::processFile(path journalfile) {
+ log() << "recover " << journalfile.string() << endl;
+
+ try {
+ if( boost::filesystem::file_size( journalfile.string() ) == 0 ) {
+ log() << "recover info " << journalfile.string() << " has zero length" << endl;
+ return true;
+ }
+ } catch(...) {
+            // if something weird happened (like a permissions problem), keep going so the massert below can fire (presumably)
+ log() << "recover exception checking filesize" << endl;
+ }
+
+ MemoryMappedFile f;
+ void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL);
+ massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p);
+ return processFileBuffer(p, (unsigned) f.length());
+ }
+
+ /** @param files all the j._0 style files we need to apply for recovery */
+ void RecoveryJob::go(vector<path>& files) {
+ log() << "recover begin" << endl;
+ _recovering = true;
+
+ // load the last sequence number synced to the datafiles on disk before the last crash
+ _lastDataSyncedFromLastRun = journalReadLSN();
+ log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl;
+
+ for( unsigned i = 0; i != files.size(); ++i ) {
+ bool abruptEnd = processFile(files[i]);
+ if( abruptEnd && i+1 < files.size() ) {
+ log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl;
+ close();
+ uasserted(13535, "recover abrupt journal file end");
+ }
+ }
+
+ close();
+
+ if( cmdLine.durOptions & CmdLine::DurScanOnly ) {
+ uasserted(13545, str::stream() << "--durOptions " << (int) CmdLine::DurScanOnly << " (scan only) specified");
+ }
+
+ log() << "recover cleaning up" << endl;
+ removeJournalFiles();
+ log() << "recover done" << endl;
+ okToCleanUp = true;
+ _recovering = false;
+ }
+
+ void _recover() {
+ assert( cmdLine.dur );
+
+ boost::filesystem::path p = getJournalDir();
+ if( !exists(p) ) {
+ log() << "directory " << p.string() << " does not exist, there will be no recovery startup step" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ vector<path> journalFiles;
+ getFiles(p, journalFiles);
+
+ if( journalFiles.empty() ) {
+ log() << "recover : no journal files present, no recovery needed" << endl;
+ okToCleanUp = true;
+ return;
+ }
+
+ RecoveryJob::get().go(journalFiles);
+ }
+
+ extern mutex groupCommitMutex;
+
+ /** recover from a crash
+ called during startup
+ throws on error
+ */
+ void recover() {
+ // we use a lock so that exitCleanly will wait for us
+ // to finish (or at least to notice what is up and stop)
+ writelock lk;
+
+ // this is so the mutexdebugger doesn't get confused. we are actually single threaded
+ // at this point in the program so it wouldn't have been a true problem (I think)
+ scoped_lock lk2(groupCommitMutex);
+
+ _recover(); // throws on interruption
+ }
+
+ struct BufReaderY { int a,b; };
+ class BufReaderUnitTest : public UnitTest {
+ public:
+ void run() {
+ BufReader r((void*) "abcdabcdabcd", 12);
+ char x;
+ BufReaderY y;
+ r.read(x); //cout << x; // a
+ assert( x == 'a' );
+ r.read(y);
+ r.read(x);
+ assert( x == 'b' );
+ }
+ } brunittest;
+
+ // can't free at termination because order of destruction of global vars is arbitrary
+ RecoveryJob &RecoveryJob::_instance = *(new RecoveryJob());
+
+ } // namespace dur
+
+} // namespace mongo
+
diff --git a/src/mongo/db/dur_recover.h b/src/mongo/db/dur_recover.h
new file mode 100644
index 00000000000..955e730ea05
--- /dev/null
+++ b/src/mongo/db/dur_recover.h
@@ -0,0 +1,50 @@
+// @file dur_recover.h crash recovery via the journal
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/concurrency/mutex.h"
+#include "../util/file.h"
+
+namespace mongo {
+ class MongoMMF;
+
+ namespace dur {
+ struct ParsedJournalEntry;
+
+ /** call go() to execute a recovery from existing journal files.
+ */
+ class RecoveryJob : boost::noncopyable {
+ public:
+ RecoveryJob() : _lastDataSyncedFromLastRun(0),
+ _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; }
+ void go(vector<path>& files);
+ ~RecoveryJob();
+
+ /** @param data data between header and footer. compressed if recovering. */
+ void processSection(const JSectHeader *h, const void *data, unsigned len, const JSectFooter *f);
+
+ void close(); // locks and calls _close()
+
+ static RecoveryJob & get() { return _instance; }
+ private:
+ void write(const ParsedJournalEntry& entry); // actually writes to the file
+ void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump);
+ void applyEntries(const vector<ParsedJournalEntry> &entries);
+ bool processFileBuffer(const void *, unsigned len);
+ bool processFile(path journalfile);
+ void _close(); // doesn't lock
+
+ list<boost::shared_ptr<MongoMMF> > _mmfs;
+
+ unsigned long long _lastDataSyncedFromLastRun;
+ unsigned long long _lastSeqMentionedInConsoleLog;
+ public:
+ mongo::mutex _mx; // protects _mmfs; see setNoJournal() too
+ private:
+ bool _recovering; // are we in recovery or WRITETODATAFILES
+
+ static RecoveryJob &_instance;
+ };
+ }
+}
diff --git a/src/mongo/db/dur_stats.h b/src/mongo/db/dur_stats.h
new file mode 100644
index 00000000000..50a26d1f215
--- /dev/null
+++ b/src/mongo/db/dur_stats.h
@@ -0,0 +1,49 @@
+// @file dur_stats.h
+
+namespace mongo {
+ namespace dur {
+
+ /** journaling stats. the model here is that the commit thread is the only writer, and that reads are
+ uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead.
+ */
+ struct Stats {
+ Stats();
+ void rotate();
+ BSONObj asObj();
+ unsigned _intervalMicros;
+ struct S {
+ BSONObj _asObj();
+ string _asCSV();
+ string _CSVHeader();
+ void reset();
+
+ unsigned _commits;
+ unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow()
+ unsigned long long _journaledBytes;
+ unsigned long long _uncompressedBytes;
+ unsigned long long _writeToDataFilesBytes;
+
+ unsigned long long _prepLogBufferMicros;
+ unsigned long long _writeToJournalMicros;
+ unsigned long long _writeToDataFilesMicros;
+ unsigned long long _remapPrivateViewMicros;
+
+            // it is undesirable to be in the write lock for the group commit (it can be done in a read lock), so it
+            // is good to have visibility when this happens. it can happen for a couple of reasons:
+ // - read lock starvation
+ // - file being closed
+ // - data being written faster than the normal group commit interval
+ unsigned _commitsInWriteLock;
+
+ unsigned _dtMillis;
+ };
+ S *curr;
+ private:
+ S _a,_b;
+ unsigned long long _lastRotate;
+ S* other();
+ };
+ extern Stats stats;
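+        // Illustrative note (not in the original source): _a and _b above appear to
+        // double-buffer the counters -- curr points at the interval being accumulated while
+        // the other S holds the last completed interval for readers; rotate() presumably swaps
+        // them and reset()s the newly current one.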
+
+ }
+}
diff --git a/src/mongo/db/dur_writetodatafiles.cpp b/src/mongo/db/dur_writetodatafiles.cpp
new file mode 100644
index 00000000000..d77b0482c20
--- /dev/null
+++ b/src/mongo/db/dur_writetodatafiles.cpp
@@ -0,0 +1,94 @@
+// @file dur_writetodatafiles.cpp apply the writes back to the non-private MMF after they are for certain in the redo log
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "dur_commitjob.h"
+#include "dur_stats.h"
+#include "dur_recover.h"
+#include "../util/timer.h"
+
+namespace mongo {
+ namespace dur {
+
+ void debugValidateAllMapsMatch();
+
+ static void WRITETODATAFILES_Impl1(const JSectHeader& h, AlignedBuilder& uncompressed) {
+ LockMongoFilesShared lk;
+ LOG(3) << "journal WRITETODATAFILES 1" << endl;
+ RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), 0);
+ LOG(3) << "journal WRITETODATAFILES 2" << endl;
+ }
+
+#if 0
+ // the old implementation. doesn't work with groupCommitWithLimitedLocks()
+ void WRITETODATAFILES_Impl2() {
+ /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr, intent.start(), intent.length());
+ }
+ }
+#endif
+
+#if defined(_EXPERIMENTAL)
+ // doesn't work with groupCommitWithLimitedLocks()
+ void WRITETODATAFILES_Impl3() {
+ /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
+ for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) {
+ const WriteIntent& intent = *it;
+ stats.curr->_writeToDataFilesBytes += intent.length();
+ dassert(intent.w_ptr);
+ memcpy(intent.w_ptr,
+ commitJob._ab.atOfs(intent.ofsInJournalBuffer),
+ intent.length());
+ }
+ }
+#endif
+
+        /** apply the writes back to the non-private MMF after they are for certain in the redo log
+
+       (1) todo we don't need to write back everything every group commit. we MUST write back
+       that which is going to be remapped on its private view - but that might not be all
+       views.
+
+ (2) todo should we do this using N threads? would be quite easy
+ see Hackenberg paper table 5 and 6. 2 threads might be a good balance.
+
+ (3) with enough work, we could do this outside the read lock. it's a bit tricky though.
+ - we couldn't do it from the private views then as they may be changing. would have to then
+ be from the journal alignedbuffer.
+ - we need to be careful the file isn't unmapped on us -- perhaps a mutex or something
+ with MongoMMF on closes or something to coordinate that.
+
+ concurrency: in mmmutex, not necessarily in dbMutex
+
+ @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en
+ */
+
+ void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed) {
+ Timer t;
+ WRITETODATAFILES_Impl1(h, uncompressed);
+ unsigned long long m = t.micros();
+ stats.curr->_writeToDataFilesMicros += m;
+ LOG(2) << "journal WRITETODATAFILES " << m / 1000.0 << "ms" << endl;
+ }
+
+ }
+}
diff --git a/src/mongo/db/durop.cpp b/src/mongo/db/durop.cpp
new file mode 100644
index 00000000000..80ee5043410
--- /dev/null
+++ b/src/mongo/db/durop.cpp
@@ -0,0 +1,161 @@
+// @file durop.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "concurrency.h"
+#include "../util/alignedbuilder.h"
+#include "../util/mongoutils/str.h"
+#include "../util/file.h"
+#include "mongommf.h"
+#include "durop.h"
+#include "../util/file_allocator.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+ extern string dbpath; // --dbpath parm
+
+ void _deleteDataFiles(const char *);
+
+ namespace dur {
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) {
+ shared_ptr<DurOp> op;
+ switch( opcode ) {
+ case JEntry::OpCode_FileCreated:
+ op = shared_ptr<DurOp>( new FileCreatedOp(br) );
+ break;
+ case JEntry::OpCode_DropDb:
+ op = shared_ptr<DurOp>( new DropDbOp(br) );
+ break;
+ default:
+ massert(13546, (str::stream() << "journal recover: unrecognized opcode in journal " << opcode), false);
+ }
+ return op;
+ }
+
+ void DurOp::serialize(AlignedBuilder& ab) {
+ ab.appendNum(_opcode);
+ _serialize(ab);
+ }
+
+ DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.readStr(_db);
+ string reservedStr;
+ log.readStr(reservedStr);
+ }
+
+ void DropDbOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendStr(_db);
+ ab.appendStr(""); // reserved
+ }
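+        // Illustrative note (not in the original source): so a serialized DropDbOp record,
+        // opcode included (written by DurOp::serialize above), looks like:
+        //   [u32 OpCode_DropDb] [u64 0] [u64 0] [db name string] [empty reserved string]
+        // and DropDbOp(BufReader&) reads the same fields back in order.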
+
+ /** throws */
+ void DropDbOp::replay() {
+ log() << "recover replay drop db " << _db << endl;
+ _deleteDataFiles(_db.c_str());
+ }
+
+ FileCreatedOp::FileCreatedOp(string f, unsigned long long l) :
+ DurOp(JEntry::OpCode_FileCreated) {
+ _p = RelativePath::fromFullPath(f);
+ _len = l;
+ }
+
+ FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) {
+ unsigned long long reserved;
+ log.read(reserved);
+ log.read(reserved);
+ log.read(_len); // size of file, not length of name
+ string s;
+ log.readStr(s);
+ _p._p = s;
+ }
+
+ void FileCreatedOp::_serialize(AlignedBuilder& ab) {
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum((unsigned long long) 0); // reserved for future use
+ ab.appendNum(_len);
+ ab.appendStr(_p.toString());
+ }
+
+ string FileCreatedOp::toString() {
+ return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len/1024.0/1024.0 << "MB";
+ }
+
+ // if an operation deletes or creates a file (or moves etc.), it may need files closed.
+ bool FileCreatedOp::needFilesClosed() {
+ return exists( _p.asFullPath() );
+ }
+
+ void FileCreatedOp::replay() {
+            // i believe the code assumes new files are filled with zeros. thus we have to recreate the file,
+            // or at least rewrite it, even if it were already the right length. perhaps one day we should
+            // change that, although it is easier to avoid defects if we can assume it is zeros.
+ string full = _p.asFullPath();
+ if( exists(full) ) {
+ try {
+ remove(full);
+ }
+ catch(std::exception& e) {
+ log(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl;
+ }
+ }
+
+ log() << "recover create file " << full << ' ' << _len/1024.0/1024.0 << "MB" << endl;
+ if( MemoryMappedFile::exists(full) ) {
+ // first delete if exists.
+ try {
+ remove(full);
+ }
+ catch(...) {
+ log() << "warning could not delete file " << full << endl;
+ }
+ }
+ ensureParentDirCreated(full);
+ File f;
+ f.open(full.c_str());
+ massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open());
+ unsigned long long left = _len;
+ const unsigned blksz = 64 * 1024;
+ scoped_array<char> v( new char[blksz] );
+ memset( v.get(), 0, blksz );
+ fileofs ofs = 0;
+ while( left ) {
+ unsigned long long w = left < blksz ? left : blksz;
+ f.write(ofs, v.get(), (unsigned) w);
+ left -= w;
+ ofs += w;
+ }
+ f.fsync();
+ flushMyDirectory(full);
+ massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() );
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/durop.h b/src/mongo/db/durop.h
new file mode 100644
index 00000000000..9ab1bfcbede
--- /dev/null
+++ b/src/mongo/db/durop.h
@@ -0,0 +1,109 @@
+// @file durop.h class DurOp and descendants
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "dur_journalformat.h"
+#include "../util/bufreader.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** DurOp - Operations we journal that aren't just basic writes.
+ *
+ * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent.
+ * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of
+ * them (we don't want a vtable for example there).
+ *
+ * For each op we want to journal, we define a subclass.
+ */
+ class DurOp { /* copyable */
+ public:
+ // @param opcode a sentinel value near max unsigned which uniquely identifies the operation.
+ // @see dur::JEntry
+ DurOp(unsigned opcode) : _opcode(opcode) { }
+
+ virtual ~DurOp() { }
+
+ /** serialize the op out to a builder which will then be written (presumably) to the journal */
+ void serialize(AlignedBuilder& ab);
+
+ /** read a durop from journal file referenced by br.
+ @param opcode the opcode which has already been written from the bufreader
+ */
+ static shared_ptr<DurOp> read(unsigned opcode, BufReader& br);
+
+ /** replay the operation (during recovery)
+ throws
+
+ For now, these are not replayed during the normal WRITETODATAFILES phase, since these
+ operations are handled in other parts of the code. At some point this may change.
+ */
+ virtual void replay() = 0;
+
+ virtual string toString() = 0;
+
+            /** if the op requires all files to be closed before doing its work, returns true. */
+ virtual bool needFilesClosed() { return false; }
+
+ protected:
+ /** DurOp will have already written the opcode for you */
+ virtual void _serialize(AlignedBuilder& ab) = 0;
+
+ private:
+ const unsigned _opcode;
+ };
+
+ /** indicates creation of a new file */
+ class FileCreatedOp : public DurOp {
+ public:
+ FileCreatedOp(BufReader& log);
+            /** @param f filename (with path) to create */
+ FileCreatedOp(string f, unsigned long long l);
+ virtual void replay();
+ virtual string toString();
+ virtual bool needFilesClosed();
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ RelativePath _p;
+ unsigned long long _len; // size of file, not length of name
+ };
+
+ /** record drop of a database */
+ class DropDbOp : public DurOp {
+ public:
+ DropDbOp(BufReader& log);
+ DropDbOp(string db) :
+ DurOp(JEntry::OpCode_DropDb), _db(db) { }
+ virtual void replay();
+ virtual string toString() { return string("DropDbOp ") + _db; }
+ virtual bool needFilesClosed() { return true; }
+ protected:
+ virtual void _serialize(AlignedBuilder& ab);
+ private:
+ string _db;
+ };
+
+ }
+
+}
diff --git a/src/mongo/db/extsort.cpp b/src/mongo/db/extsort.cpp
new file mode 100644
index 00000000000..06a9756cc0a
--- /dev/null
+++ b/src/mongo/db/extsort.cpp
@@ -0,0 +1,245 @@
+// extsort.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "extsort.h"
+#include "namespace-inl.h"
+#include "../util/file.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+namespace mongo {
+
+ IndexInterface *BSONObjExternalSorter::extSortIdxInterface;
+ Ordering BSONObjExternalSorter::extSortOrder( Ordering::make(BSONObj()) );
+ unsigned long long BSONObjExternalSorter::_compares = 0;
+
+ BSONObjExternalSorter::BSONObjExternalSorter( IndexInterface &i, const BSONObj & order , long maxFileSize )
+ : _idxi(i), _order( order.getOwned() ) , _maxFilesize( maxFileSize ) ,
+ _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0) {
+
+ stringstream rootpath;
+ rootpath << dbpath;
+ if ( dbpath[dbpath.size()-1] != '/' )
+ rootpath << "/";
+ rootpath << "_tmp/esort." << time(0) << "." << rand() << "/";
+ _root = rootpath.str();
+
+ log(1) << "external sort root: " << _root.string() << endl;
+
+ create_directories( _root );
+ _compares = 0;
+ }
+
+ BSONObjExternalSorter::~BSONObjExternalSorter() {
+ if ( _cur ) {
+ delete _cur;
+ _cur = 0;
+ }
+ unsigned long removed = remove_all( _root );
+ wassert( removed == 1 + _files.size() );
+ }
+
+ void BSONObjExternalSorter::_sortInMem() {
+        // extSortComp needs to use globals
+ // qsort_r only seems available on bsd, which is what i really want to use
+ dblock l;
+ extSortIdxInterface = &_idxi;
+ extSortOrder = Ordering::make(_order);
+ _cur->sort( BSONObjExternalSorter::extSortComp );
+ }
+
+ void BSONObjExternalSorter::sort() {
+ uassert( 10048 , "already sorted" , ! _sorted );
+
+ _sorted = true;
+
+ if ( _cur && _files.size() == 0 ) {
+ _sortInMem();
+ log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl;
+ return;
+ }
+
+ if ( _cur ) {
+ finishMap();
+ }
+
+ if ( _cur ) {
+ delete _cur;
+ _cur = 0;
+ }
+
+ if ( _files.size() == 0 )
+ return;
+
+ }
+
+ void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ) {
+ uassert( 10049 , "sorted already" , ! _sorted );
+
+ if ( ! _cur ) {
+ _cur = new InMemory( _arraySize );
+ }
+
+ Data& d = _cur->getNext();
+ d.first = o.getOwned();
+ d.second = loc;
+
+ long size = o.objsize();
+ _curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj );
+
+ if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ) {
+ finishMap();
+ log(1) << "finishing map" << endl;
+ }
+
+ }
+
+ void BSONObjExternalSorter::finishMap() {
+ uassert( 10050 , "bad" , _cur );
+
+ _curSizeSoFar = 0;
+ if ( _cur->size() == 0 )
+ return;
+
+ _sortInMem();
+
+ stringstream ss;
+ ss << _root.string() << "/file." << _files.size();
+ string file = ss.str();
+
+ // todo: it may make sense to fadvise that this not be cached so that building the index doesn't
+ // eject other things the db is using from the file system cache. while we will soon be reading
+ // this back, if it fit in ram, there wouldn't have been a need for an external sort in the first
+ // place.
+
+ ofstream out;
+ out.open( file.c_str() , ios_base::out | ios_base::binary );
+ assertStreamGood( 10051 , (string)"couldn't open file: " + file , out );
+
+ int num = 0;
+ for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ) {
+ Data p = *i;
+ out.write( p.first.objdata() , p.first.objsize() );
+ out.write( (char*)(&p.second) , sizeof( DiskLoc ) );
+ num++;
+ }
+
+ _cur->clear();
+
+ _files.push_back( file );
+ out.close();
+
+        log(2) << "Added file: " << file << " with " << num << " objects for external sort" << endl;
+ }
+
+ // ---------------------------------
+
+ BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) :
+ _cmp( sorter->_idxi, sorter->_order ) , _in( 0 ) {
+
+ for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ) {
+ _files.push_back( new FileIterator( *i ) );
+ _stash.push_back( pair<Data,bool>( Data( BSONObj() , DiskLoc() ) , false ) );
+ }
+
+ if ( _files.size() == 0 && sorter->_cur ) {
+ _in = sorter->_cur;
+ _it = sorter->_cur->begin();
+ }
+ }
+
+ BSONObjExternalSorter::Iterator::~Iterator() {
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ delete *i;
+ _files.clear();
+ }
+
+ bool BSONObjExternalSorter::Iterator::more() {
+
+ if ( _in )
+ return _it != _in->end();
+
+ for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ )
+ if ( (*i)->more() )
+ return true;
+ for ( vector< pair<Data,bool> >::iterator i=_stash.begin(); i!=_stash.end(); i++ )
+ if ( i->second )
+ return true;
+ return false;
+ }
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next() {
+
+ if ( _in ) {
+ Data& d = *_it;
+ ++_it;
+ return d;
+ }
+
+ Data best;
+ int slot = -1;
+
+ for ( unsigned i=0; i<_stash.size(); i++ ) {
+
+ if ( ! _stash[i].second ) {
+ if ( _files[i]->more() )
+ _stash[i] = pair<Data,bool>( _files[i]->next() , true );
+ else
+ continue;
+ }
+
+ if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ) {
+ best = _stash[i].first;
+ slot = i;
+ }
+
+ }
+
+ assert( slot >= 0 );
+ _stash[slot].second = false;
+
+ return best;
+ }
+
+ // -----------------------------------
+
+ BSONObjExternalSorter::FileIterator::FileIterator( string file ) {
+ unsigned long long length;
+ _buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL );
+ massert( 10308 , "mmap failed" , _buf );
+ assert( length == (unsigned long long) file_size( file ) );
+ _end = _buf + length;
+ }
+ BSONObjExternalSorter::FileIterator::~FileIterator() {}
+
+ bool BSONObjExternalSorter::FileIterator::more() {
+ return _buf < _end;
+ }
+
+ BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next() {
+ BSONObj o( _buf );
+ _buf += o.objsize();
+ DiskLoc * l = (DiskLoc*)_buf;
+ _buf += 8;
+ return Data( o , *l );
+ }
+
+}
diff --git a/src/mongo/db/extsort.h b/src/mongo/db/extsort.h
new file mode 100644
index 00000000000..15a6d441849
--- /dev/null
+++ b/src/mongo/db/extsort.h
@@ -0,0 +1,150 @@
+// extsort.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "namespace-inl.h"
+#include "curop-inl.h"
+#include "../util/array.h"
+
+namespace mongo {
+
+ /**
+ for external (disk) sorting by BSONObj and attaching a value
+ */
+ class BSONObjExternalSorter : boost::noncopyable {
+ public:
+ BSONObjExternalSorter( IndexInterface &i, const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 );
+ ~BSONObjExternalSorter();
+ typedef pair<BSONObj,DiskLoc> Data;
+
+ private:
+ IndexInterface& _idxi;
+
+ static int _compare(IndexInterface& i, const Data& l, const Data& r, const Ordering& order) {
+ RARELY killCurrentOp.checkForInterrupt();
+ _compares++;
+ int x = i.keyCompare(l.first, r.first, order);
+ if ( x )
+ return x;
+ return l.second.compare( r.second );
+ }
+
+ class MyCmp {
+ public:
+ MyCmp( IndexInterface& i, BSONObj order = BSONObj() ) : _i(i), _order( Ordering::make(order) ) {}
+ bool operator()( const Data &l, const Data &r ) const {
+ return _compare(_i, l, r, _order) < 0;
+ };
+ private:
+ IndexInterface& _i;
+ const Ordering _order;
+ };
+
+ static IndexInterface *extSortIdxInterface;
+ static Ordering extSortOrder;
+ static int extSortComp( const void *lv, const void *rv ) {
+ DEV RARELY {
+ d.dbMutex.assertWriteLocked(); // must be as we use a global var
+ }
+ Data * l = (Data*)lv;
+ Data * r = (Data*)rv;
+ return _compare(*extSortIdxInterface, *l, *r, extSortOrder);
+ };
+
+ class FileIterator : boost::noncopyable {
+ public:
+ FileIterator( string file );
+ ~FileIterator();
+ bool more();
+ Data next();
+ private:
+ MemoryMappedFile _file;
+ char * _buf;
+ char * _end;
+ };
+
+ public:
+
+ typedef FastArray<Data> InMemory;
+
+ class Iterator : boost::noncopyable {
+ public:
+
+ Iterator( BSONObjExternalSorter * sorter );
+ ~Iterator();
+ bool more();
+ Data next();
+
+ private:
+ MyCmp _cmp;
+ vector<FileIterator*> _files;
+ vector< pair<Data,bool> > _stash;
+
+ InMemory * _in;
+ InMemory::iterator _it;
+
+ };
+
+ void add( const BSONObj& o , const DiskLoc & loc );
+ void add( const BSONObj& o , int a , int b ) {
+ add( o , DiskLoc( a , b ) );
+ }
+
+ /* call after adding values, and before fetching the iterator */
+ void sort();
+
+ auto_ptr<Iterator> iterator() {
+ uassert( 10052 , "not sorted" , _sorted );
+ return auto_ptr<Iterator>( new Iterator( this ) );
+ }
+
+ int numFiles() {
+ return _files.size();
+ }
+
+ long getCurSizeSoFar() { return _curSizeSoFar; }
+
+ void hintNumObjects( long long numObjects ) {
+ if ( numObjects < _arraySize )
+ _arraySize = (int)(numObjects + 100);
+ }
+
+ private:
+
+ void _sortInMem();
+
+ void sort( string file );
+ void finishMap();
+
+ BSONObj _order;
+ long _maxFilesize;
+ path _root;
+
+ int _arraySize;
+ InMemory * _cur;
+ long _curSizeSoFar;
+
+ list<string> _files;
+ bool _sorted;
+
+ static unsigned long long _compares;
+ };
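+    // Illustrative note (not in the original source): a minimal usage sketch of the sorter
+    // above; idx, obj and loc are hypothetical.
+    //   BSONObjExternalSorter sorter( idx, BSON( "a" << 1 ) );
+    //   sorter.add( obj, loc );       // spills a sorted run to a file past maxFileSize
+    //   sorter.sort();                // in-memory sort, or finish the last run
+    //   auto_ptr<BSONObjExternalSorter::Iterator> it = sorter.iterator();
+    //   while( it->more() ) {
+    //       BSONObjExternalSorter::Data d = it->next();   // k-way merge across runs
+    //   }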
+}
diff --git a/src/mongo/db/filever.h b/src/mongo/db/filever.h
new file mode 100644
index 00000000000..e89a8243dcf
--- /dev/null
+++ b/src/mongo/db/filever.h
@@ -0,0 +1,30 @@
+/* filever.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ inline void checkDataFileVersion(NamespaceDetails& d) {
+ }
+
+ inline void checkIndexFileVersion(NamespaceDetails& d) {
+ }
+
+}
+
diff --git a/src/mongo/db/flushtest.cpp b/src/mongo/db/flushtest.cpp
new file mode 100644
index 00000000000..2009d922950
--- /dev/null
+++ b/src/mongo/db/flushtest.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include <stdio.h>
+#include "../util/goodies.h"
+#include <fcntl.h>
+
+namespace mongo {
+
+#if defined(F_FULLFSYNC)
+ void fullsync(int f) {
+ fcntl( f, F_FULLFSYNC );
+ }
+#else
+ void fullsync(int f) {
+ fdatasync(f);
+ }
+#endif
+
+ int main(int argc, char* argv[], char *envp[] ) {
+ cout << "hello" << endl;
+
+ FILE *f = fopen("/data/db/temptest", "a");
+
+ if ( f == 0 ) {
+ cout << "can't open file\n";
+ return 1;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 50000; i++ )
+ fwrite("abc", 3, 1, f);
+ cout << "small writes: " << t.millis() << "ms" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 10000; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ }
+ int ms = t.millis();
+ cout << "flush: " << ms << "ms, " << ms / 10000.0 << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ fwrite("abc", 3, 1, f);
+ fflush(f);
+ fsync( fileno( f ) );
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 500 * 2;
+ cout << "flush with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+
+ char buf[8192];
+ for ( int pass = 0; pass < 2; pass++ ) {
+ cout << "pass " << pass << endl;
+ {
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i = 0; i < 500; i++ ) {
+ if ( pass == 0 )
+ fwrite("abc", 3, 1, f);
+ else
+ fwrite(buf, 8192, 1, f);
+ buf[0]++;
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl;
+ }
+ }
+
+ // without growing
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ }
+ int ms = t.millis();
+ cout << "fullsync without growing: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+ // without growing, with delay
+ {
+ fclose(f);
+ /* try from beginning of the file, where we aren't appending and changing the file length,
+ to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect).
+ */
+ f = fopen("/data/db/temptest", "r+");
+ Timer t;
+ int n = 500;
+ for ( int i = 0; i < n; i++ ) {
+ fwrite("xyz", 3, 1, f);
+ fflush(f);
+ fullsync(fileno(f));
+ sleepmillis(2);
+ }
+ int ms = t.millis() - 2 * 500;
+ cout << "fullsync without growing with sleeps: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl;
+ }
+
+        fclose(f);
+        return 0;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/geo/2d.cpp b/src/mongo/db/geo/2d.cpp
new file mode 100644
index 00000000000..f05ce4315b2
--- /dev/null
+++ b/src/mongo/db/geo/2d.cpp
@@ -0,0 +1,3289 @@
+// geo2d.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../namespace-inl.h"
+#include "../jsobj.h"
+#include "../index.h"
+#include "../../util/unittest.h"
+#include "../commands.h"
+#include "../pdfile.h"
+#include "../btree.h"
+#include "../curop-inl.h"
+#include "../matcher.h"
+#include "../queryutil.h"
+#include "core.h"
+#include "../../util/timer.h"
+
+// Note: we use IndexInterface herein to talk to the btree code. In the future it would be nice
+// to be able to use the V1 key class (see key.h) instead of toBson(), which has some cost.
+// toBson() is new with V1, so this path could be slower than it used to be; quick profiling
+// would make sense.
+
+namespace mongo {
+
+ class GeoKeyNode {
+ GeoKeyNode();
+ public:
+ GeoKeyNode( DiskLoc bucket, int keyOfs, DiskLoc r, BSONObj k) : _bucket( bucket ), _keyOfs( keyOfs ), recordLoc(r), _key(k) { }
+ const DiskLoc _bucket;
+ const int _keyOfs;
+ const DiskLoc recordLoc;
+ const BSONObj _key;
+ };
+
+ // just use old indexes for geo for now. todo.
+// typedef BtreeBucket<V0> GeoBtreeBucket;
+// typedef GeoBtreeBucket::KeyNode GeoKeyNode;
+
+//#define BTREE btree<V0>
+
+#if 0
+# define CDEBUG -1
+#else
+# define CDEBUG 10
+#endif
+
+#if 0
+# define GEODEBUGGING
+# define GEODEBUG(x) cout << x << endl;
+# define GEODEBUGPRINT(x) PRINT(x)
+ inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g) {
+ if (!prefix.constrains()) {
+ cout << "\t empty prefix" << endl;
+ return ;
+ }
+
+ Point ll (g, prefix); // lower left
+ prefix.move(1,1);
+ Point tr (g, prefix); // top right
+
+ Point center ( (ll._x+tr._x)/2, (ll._y+tr._y)/2 );
+ double radius = fabs(ll._x - tr._x) / 2;
+
+ cout << "\t ll: " << ll.toString() << " tr: " << tr.toString()
+ << " center: " << center.toString() << " radius: " << radius << endl;
+
+ }
+#else
+# define GEODEBUG(x)
+# define GEODEBUGPRINT(x)
+# define PREFIXDEBUG(x, y)
+#endif
+
+ const double EARTH_RADIUS_KM = 6371;
+ const double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192;
+
+ enum GeoDistType {
+ GEO_PLAIN,
+ GEO_SPHERE
+ };
+
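+    // A degree of longitude spans less distance away from the equator (it shrinks
+    // by cos(latitude)), so widen the x scan distance accordingly, clamping near
+    // the poles where the correction would blow up.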
+ inline double computeXScanDistance(double y, double maxDistDegrees) {
+        // TODO: this overestimates for large maxDistDegrees far from the equator
+ return maxDistDegrees / min(cos(deg2rad(min(+89.0, y + maxDistDegrees))),
+ cos(deg2rad(max(-89.0, y - maxDistDegrees))));
+ }
+
+ GeoBitSets geoBitSets;
+
+ const string GEO2DNAME = "2d";
+
+ class Geo2dType : public IndexType , public GeoConvert {
+ public:
+ virtual ~Geo2dType() { }
+
+ Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec )
+ : IndexType( plugin , spec ) {
+
+ BSONObjBuilder orderBuilder;
+
+ BSONObjIterator i( spec->keyPattern );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == String && GEO2DNAME == e.valuestr() ) {
+                    uassert( 13022 , "can't have 2 geo fields" , _geo.size() == 0 );
+ uassert( 13023 , "2d has to be first in index" , _other.size() == 0 );
+ _geo = e.fieldName();
+ }
+ else {
+ _other.push_back( e.fieldName() );
+ }
+ orderBuilder.append( "" , 1 );
+ }
+
+ uassert( 13024 , "no geo field specified" , _geo.size() );
+
+ double bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft
+
+ uassert( 13028 , "bits in geo index must be between 1 and 32" , bits > 0 && bits <= 32 );
+
+ _bits = (unsigned) bits;
+
+ _max = _configval( spec , "max" , 180.0 );
+ _min = _configval( spec , "min" , -180.0 );
+
+ double numBuckets = (1024 * 1024 * 1024 * 4.0);
+
+ _scaling = numBuckets / ( _max - _min );
+
+ _order = orderBuilder.obj();
+
+ GeoHash a(0, 0, _bits);
+ GeoHash b = a;
+ b.move(1, 1);
+
+            // Epsilon is 1/1000th of a bucket size
+ // TODO: Can we actually find error bounds for the sqrt function?
+ double epsilon = 0.001 / _scaling;
+ _error = distance(a, b) + epsilon;
+
+ // Error in radians
+ _errorSphere = deg2rad( _error );
+
+ }
+
+ double _configval( const IndexSpec* spec , const string& name , double def ) {
+ BSONElement e = spec->info[name];
+ if ( e.isNumber() ) {
+ return e.numberDouble();
+ }
+ return def;
+ }
+
+ virtual BSONObj fixKey( const BSONObj& in ) {
+ if ( in.firstElement().type() == BinData )
+ return in;
+
+ BSONObjBuilder b(in.objsize()+16);
+
+ if ( in.firstElement().isABSONObj() )
+ _hash( in.firstElement().embeddedObject() ).append( b , "" );
+ else if ( in.firstElement().type() == String )
+ GeoHash( in.firstElement().valuestr() ).append( b , "" );
+ else if ( in.firstElement().type() == RegEx )
+ GeoHash( in.firstElement().regex() ).append( b , "" );
+ else
+ return in;
+
+ BSONObjIterator i(in);
+ i.next();
+ while ( i.more() )
+ b.append( i.next() );
+ return b.obj();
+ }
+
+ /** Finds the key objects to put in an index */
+ virtual void getKeys( const BSONObj& obj, BSONObjSet& keys ) const {
+ getKeys( obj, &keys, NULL );
+ }
+
+ /** Finds all locations in a geo-indexed object */
+ // TODO: Can we just return references to the locs, if they won't change?
+ void getKeys( const BSONObj& obj, vector< BSONObj >& locs ) const {
+ getKeys( obj, NULL, &locs );
+ }
+
+ /** Finds the key objects and/or locations for a geo-indexed object */
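+        // For example, with an index { loc : "2d", cat : 1 }, a document like
+        // { loc : [ [ 1, 2 ], [ 3, 4 ] ], cat : "a" } yields one key per location:
+        // { "" : hash(1,2), "" : "a" } and { "" : hash(3,4), "" : "a" }.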
+ void getKeys( const BSONObj &obj, BSONObjSet* keys, vector< BSONObj >* locs ) const {
+
+ BSONElementMSet bSet;
+
+ // Get all the nested location fields, but don't return individual elements from
+ // the last array, if it exists.
+ obj.getFieldsDotted(_geo.c_str(), bSet, false);
+
+ if( bSet.empty() )
+ return;
+
+ for( BSONElementMSet::iterator setI = bSet.begin(); setI != bSet.end(); ++setI ) {
+
+ BSONElement geo = *setI;
+
+ GEODEBUG( "Element " << geo << " found for query " << _geo.c_str() );
+
+ if ( geo.eoo() || ! geo.isABSONObj() )
+ continue;
+
+ //
+ // Grammar for location lookup:
+ // locs ::= [loc,loc,...,loc]|{<k>:loc,<k>:loc,...,<k>:loc}|loc
+ // loc ::= { <k1> : #, <k2> : # }|[#, #]|{}
+ //
+ // Empty locations are ignored, preserving single-location semantics
+ //
+
+ BSONObj embed = geo.embeddedObject();
+ if ( embed.isEmpty() )
+ continue;
+
+ // Differentiate between location arrays and locations
+ // by seeing if the first element value is a number
+ bool singleElement = embed.firstElement().isNumber();
+
+ BSONObjIterator oi(embed);
+
+ while( oi.more() ) {
+
+ BSONObj locObj;
+
+ if( singleElement ) locObj = embed;
+ else {
+ BSONElement locElement = oi.next();
+
+ uassert( 13654, str::stream() << "location object expected, location array not in correct format",
+ locElement.isABSONObj() );
+
+ locObj = locElement.embeddedObject();
+
+ if( locObj.isEmpty() )
+ continue;
+ }
+
+ BSONObjBuilder b(64);
+
+ // Remember the actual location object if needed
+ if( locs )
+ locs->push_back( locObj );
+
+ // Stop if we don't need to get anything but location objects
+ if( ! keys ) {
+ if( singleElement ) break;
+ else continue;
+ }
+
+ _hash( locObj ).append( b , "" );
+
+ // Go through all the other index keys
+ for ( vector<string>::const_iterator i = _other.begin(); i != _other.end(); ++i ) {
+
+ // Get *all* fields for the index key
+ BSONElementSet eSet;
+ obj.getFieldsDotted( *i, eSet );
+
+
+ if ( eSet.size() == 0 )
+ b.appendAs( _spec->missingField(), "" );
+ else if ( eSet.size() == 1 )
+ b.appendAs( *(eSet.begin()), "" );
+ else {
+
+ // If we have more than one key, store as an array of the objects
+
+ BSONArrayBuilder aBuilder;
+
+ for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ) {
+ aBuilder.append( *ei );
+ }
+
+ BSONArray arr = aBuilder.arr();
+
+ b.append( "", arr );
+
+ }
+
+ }
+
+ keys->insert( b.obj() );
+
+ if( singleElement ) break;
+
+ }
+ }
+
+ }
+
+ BSONObj _fromBSONHash( const BSONElement& e ) const {
+ return _unhash( _tohash( e ) );
+ }
+
+ BSONObj _fromBSONHash( const BSONObj& o ) const {
+ return _unhash( _tohash( o.firstElement() ) );
+ }
+
+ GeoHash _tohash( const BSONElement& e ) const {
+ if ( e.isABSONObj() )
+ return _hash( e.embeddedObject() );
+
+ return GeoHash( e , _bits );
+ }
+
+ GeoHash _hash( const BSONObj& o ) const {
+ BSONObjIterator i(o);
+ uassert( 13067 , "geo field is empty" , i.more() );
+ BSONElement x = i.next();
+ uassert( 13068 , "geo field only has 1 element" , i.more() );
+ BSONElement y = i.next();
+
+ uassert( 13026 , "geo values have to be numbers: " + o.toString() , x.isNumber() && y.isNumber() );
+
+ return hash( x.number() , y.number() );
+ }
+
+ GeoHash hash( const Point& p ) const {
+ return hash( p._x, p._y );
+ }
+
+ GeoHash hash( double x , double y ) const {
+ return GeoHash( _convert(x), _convert(y) , _bits );
+ }
+
+ BSONObj _unhash( const GeoHash& h ) const {
+ unsigned x , y;
+ h.unhash( x , y );
+ BSONObjBuilder b;
+ b.append( "x" , _unconvert( x ) );
+ b.append( "y" , _unconvert( y ) );
+ return b.obj();
+ }
+
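+        // Maps a coordinate from [ _min, _max ) onto the unsigned 32-bit bucket
+        // space. With the defaults ( -180, 180 ), _scaling is 2^32 / 360, so e.g.
+        // _convert( 0 ) == 2^31 and _convert( -180 ) == 0.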
+ unsigned _convert( double in ) const {
+ uassert( 13027 , str::stream() << "point not in interval of [ " << _min << ", " << _max << " )", in < _max && in >= _min );
+ in -= _min;
+ assert( in >= 0 );
+ return (unsigned)(in * _scaling);
+ }
+
+ double _unconvert( unsigned in ) const {
+ double x = in;
+ x /= _scaling;
+ x += _min;
+ return x;
+ }
+
+ void unhash( const GeoHash& h , double& x , double& y ) const {
+ unsigned a,b;
+ h.unhash(a,b);
+ x = _unconvert( a );
+ y = _unconvert( b );
+ }
+
+ double distance( const GeoHash& a , const GeoHash& b ) const {
+ double ax,ay,bx,by;
+ unhash( a , ax , ay );
+ unhash( b , bx , by );
+
+ double dx = bx - ax;
+ double dy = by - ay;
+
+ return sqrt( ( dx * dx ) + ( dy * dy ) );
+ }
+
+ double sizeDiag( const GeoHash& a ) const {
+ GeoHash b = a;
+ b.move( 1 , 1 );
+ return distance( a , b );
+ }
+
+ double sizeEdge( const GeoHash& a ) const {
+
+ if( ! a.constrains() )
+ return _max - _min;
+
+ double ax,ay,bx,by;
+ GeoHash b = a;
+ b.move( 1 , 1 );
+ unhash( a, ax, ay );
+ unhash( b, bx, by );
+
+ // _min and _max are a singularity
+ if (bx == _min)
+ bx = _max;
+
+ return (fabs(ax-bx));
+ }
+
+ const IndexDetails* getDetails() const {
+ return _spec->getDetails();
+ }
+
+ virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const;
+
+ virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const {
+ BSONElement e = query.getFieldDotted(_geo.c_str());
+ switch ( e.type() ) {
+ case Object: {
+ BSONObj sub = e.embeddedObject();
+ switch ( sub.firstElement().getGtLtOp() ) {
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ return OPTIMAL;
+ default:
+                    // We can try to match if there's no other indexing defined;
+                    // the value is assumed to be a point
+ return HELPFUL;
+ }
+ }
+ case Array:
+                // We can try to match if there's no other indexing defined;
+                // the value is assumed to be a point
+ return HELPFUL;
+ default:
+ return USELESS;
+ }
+ }
+
+ string _geo;
+ vector<string> _other;
+
+ unsigned _bits;
+ double _max;
+ double _min;
+ double _scaling;
+
+ BSONObj _order;
+ double _error;
+ double _errorSphere;
+ };
+
+ class Box {
+ public:
+
+ Box( const Geo2dType * g , const GeoHash& hash )
+ : _min( g , hash ) ,
+ _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ) {
+ }
+
+ Box( double x , double y , double size )
+ : _min( x , y ) ,
+ _max( x + size , y + size ) {
+ }
+
+ Box( Point min , Point max )
+ : _min( min ) , _max( max ) {
+ }
+
+ Box() {}
+
+ BSONArray toBSON() const {
+ return BSON_ARRAY( BSON_ARRAY( _min._x << _min._y ) << BSON_ARRAY( _max._x << _max._y ) );
+ }
+
+ string toString() const {
+ StringBuilder buf(64);
+ buf << _min.toString() << " -->> " << _max.toString();
+ return buf.str();
+ }
+
+ bool between( double min , double max , double val , double fudge=0) const {
+ return val + fudge >= min && val <= max + fudge;
+ }
+
+ bool onBoundary( double bound, double val, double fudge = 0 ) const {
+ return ( val >= bound - fudge && val <= bound + fudge );
+ }
+
+ bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const {
+ assert( amin <= amax );
+ assert( bmin <= bmax );
+
+ if ( amin < bmin ) {
+ if ( amax < bmin )
+ return false;
+ res = min ? bmin : amax;
+ return true;
+ }
+ if ( amin > bmax )
+ return false;
+ res = min ? amin : bmax;
+ return true;
+ }
+
+ double intersects( const Box& other ) const {
+
+ Point boundMin(0,0);
+ Point boundMax(0,0);
+
+ if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false ||
+ mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false ||
+ mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false )
+ return 0;
+
+ Box intersection( boundMin , boundMax );
+
+ return intersection.area() / area();
+ }
+
+ double area() const {
+ return ( _max._x - _min._x ) * ( _max._y - _min._y );
+ }
+
+ double maxDim() const {
+ return max( _max._x - _min._x, _max._y - _min._y );
+ }
+
+ Point center() const {
+ return Point( ( _min._x + _max._x ) / 2 ,
+ ( _min._y + _max._y ) / 2 );
+ }
+
+ void truncate( const Geo2dType* g ) {
+ if( _min._x < g->_min ) _min._x = g->_min;
+ if( _min._y < g->_min ) _min._y = g->_min;
+ if( _max._x > g->_max ) _max._x = g->_max;
+ if( _max._y > g->_max ) _max._y = g->_max;
+ }
+
+ void fudge( const Geo2dType* g ) {
+ _min._x -= g->_error;
+ _min._y -= g->_error;
+ _max._x += g->_error;
+ _max._y += g->_error;
+ }
+
+ bool onBoundary( Point p, double fudge = 0 ) {
+ return onBoundary( _min._x, p._x, fudge ) ||
+ onBoundary( _max._x, p._x, fudge ) ||
+ onBoundary( _min._y, p._y, fudge ) ||
+ onBoundary( _max._y, p._y, fudge );
+ }
+
+ bool inside( Point p , double fudge = 0 ) {
+ bool res = inside( p._x , p._y , fudge );
+ //cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl;
+ return res;
+ }
+
+ bool inside( double x , double y , double fudge = 0 ) {
+ return
+ between( _min._x , _max._x , x , fudge ) &&
+ between( _min._y , _max._y , y , fudge );
+ }
+
+ bool contains(const Box& other, double fudge=0) {
+ return inside(other._min, fudge) && inside(other._max, fudge);
+ }
+
+ Point _min;
+ Point _max;
+ };
+
+
+ class Polygon {
+ public:
+
+ Polygon( void ) : _centroidCalculated( false ) {}
+
+ Polygon( vector<Point> points ) : _centroidCalculated( false ),
+ _points( points ) { }
+
+ void add( Point p ) {
+ _centroidCalculated = false;
+ _points.push_back( p );
+ }
+
+ int size( void ) const {
+ return _points.size();
+ }
+
+ /**
+ * Determine if the point supplied is contained by the current polygon.
+ *
+ * The algorithm uses a ray casting method.
+ */
+ bool contains( const Point& p ) const {
+ return contains( p, 0 ) > 0;
+ }
+
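+        /**
+         * Tri-state containment check: returns 1 if the point is inside the polygon
+         * (or exactly on a vertex/edge when fudge == 0), -1 if it is outside, and 0
+         * if the fudge box around the point intersects a polygon edge, i.e. the
+         * answer is only known to within error bounds.
+         */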
+ int contains( const Point &p, double fudge ) const {
+
+ Box fudgeBox( Point( p._x - fudge, p._y - fudge ), Point( p._x + fudge, p._y + fudge ) );
+
+ int counter = 0;
+ Point p1 = _points[0];
+ for ( int i = 1; i <= size(); i++ ) {
+ Point p2 = _points[i % size()];
+
+ GEODEBUG( "Doing intersection check of " << fudgeBox.toString() << " with seg " << p1.toString() << " to " << p2.toString() );
+
+ // We need to check whether or not this segment intersects our error box
+ if( fudge > 0 &&
+ // Points not too far below box
+ fudgeBox._min._y <= std::max( p1._y, p2._y ) &&
+ // Points not too far above box
+ fudgeBox._max._y >= std::min( p1._y, p2._y ) &&
+ // Points not too far to left of box
+ fudgeBox._min._x <= std::max( p1._x, p2._x ) &&
+ // Points not too far to right of box
+ fudgeBox._max._x >= std::min( p1._x, p2._x ) ) {
+
+ GEODEBUG( "Doing detailed check" );
+
+ // If our box contains one or more of these points, we need to do an exact check.
+ if( fudgeBox.inside(p1) ) {
+ GEODEBUG( "Point 1 inside" );
+ return 0;
+ }
+ if( fudgeBox.inside(p2) ) {
+ GEODEBUG( "Point 2 inside" );
+ return 0;
+ }
+
+ // Do intersection check for vertical sides
+ if ( p1._y != p2._y ) {
+
+ double invSlope = ( p2._x - p1._x ) / ( p2._y - p1._y );
+
+ double xintersT = ( fudgeBox._max._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersT && fudgeBox._max._x >= xintersT ) {
+ GEODEBUG( "Top intersection @ " << xintersT );
+ return 0;
+ }
+
+ double xintersB = ( fudgeBox._min._y - p1._y ) * invSlope + p1._x;
+ if( fudgeBox._min._x <= xintersB && fudgeBox._max._x >= xintersB ) {
+ GEODEBUG( "Bottom intersection @ " << xintersB );
+ return 0;
+ }
+
+ }
+
+ // Do intersection check for horizontal sides
+ if( p1._x != p2._x ) {
+
+ double slope = ( p2._y - p1._y ) / ( p2._x - p1._x );
+
+ double yintersR = ( p1._x - fudgeBox._max._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersR && fudgeBox._max._y >= yintersR ) {
+ GEODEBUG( "Right intersection @ " << yintersR );
+ return 0;
+ }
+
+ double yintersL = ( p1._x - fudgeBox._min._x ) * slope + p1._y;
+ if( fudgeBox._min._y <= yintersL && fudgeBox._max._y >= yintersL ) {
+ GEODEBUG( "Left intersection @ " << yintersL );
+ return 0;
+ }
+
+ }
+
+ }
+ else if( fudge == 0 ){
+
+ // If this is an exact vertex, we won't intersect, so check this
+ if( p._y == p1._y && p._x == p1._x ) return 1;
+ else if( p._y == p2._y && p._x == p2._x ) return 1;
+
+ // If this is a horizontal line we won't intersect, so check this
+ if( p1._y == p2._y && p._y == p1._y ){
+ // Check that the x-coord lies in the line
+ if( p._x >= std::min( p1._x, p2._x ) && p._x <= std::max( p1._x, p2._x ) ) return 1;
+ }
+
+ }
+
+ // Normal intersection test.
+ // TODO: Invert these for clearer logic?
+ if ( p._y > std::min( p1._y, p2._y ) ) {
+ if ( p._y <= std::max( p1._y, p2._y ) ) {
+ if ( p._x <= std::max( p1._x, p2._x ) ) {
+ if ( p1._y != p2._y ) {
+ double xinters = (p._y-p1._y)*(p2._x-p1._x)/(p2._y-p1._y)+p1._x;
+ // Special case of point on vertical line
+ if ( p1._x == p2._x && p._x == p1._x ){
+
+ // Need special case for the vertical edges, for example:
+ // 1) \e pe/----->
+ // vs.
+ // 2) \ep---e/----->
+ //
+                                    // if we count an exact hit as an intersection, then 1 is in but 2 is out;
+                                    // if we count it as no intersection, then 1 is out but 2 is in.
+
+ return 1;
+ }
+ else if( p1._x == p2._x || p._x <= xinters ) {
+ counter++;
+ }
+ }
+ }
+ }
+ }
+
+ p1 = p2;
+ }
+
+ if ( counter % 2 == 0 ) {
+ return -1;
+ }
+ else {
+ return 1;
+ }
+ }
+
+ /**
+ * Calculate the centroid, or center of mass of the polygon object.
+ */
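+        // Uses the standard shoelace-based centroid formula:
+        //   Cx = 1/(6A) * sum( (x_i + x_{i+1}) * (x_i * y_{i+1} - x_{i+1} * y_i) )
+        //   Cy = 1/(6A) * sum( (y_i + y_{i+1}) * (x_i * y_{i+1} - x_{i+1} * y_i) )
+        // where A is the signed area of the polygon.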
+ Point centroid( void ) {
+
+            /* The centroid is cached; it won't change unless points are added */
+ if ( _centroidCalculated ) {
+ return _centroid;
+ }
+
+ Point cent;
+ double signedArea = 0.0;
+ double area = 0.0; // Partial signed area
+
+            // For all vertices except the last
+ int i = 0;
+ for ( i = 0; i < size() - 1; ++i ) {
+ area = _points[i]._x * _points[i+1]._y - _points[i+1]._x * _points[i]._y ;
+ signedArea += area;
+ cent._x += ( _points[i]._x + _points[i+1]._x ) * area;
+ cent._y += ( _points[i]._y + _points[i+1]._y ) * area;
+ }
+
+ // Do last vertex
+ area = _points[i]._x * _points[0]._y - _points[0]._x * _points[i]._y;
+ cent._x += ( _points[i]._x + _points[0]._x ) * area;
+ cent._y += ( _points[i]._y + _points[0]._y ) * area;
+ signedArea += area;
+ signedArea *= 0.5;
+ cent._x /= ( 6 * signedArea );
+ cent._y /= ( 6 * signedArea );
+
+ _centroidCalculated = true;
+ _centroid = cent;
+
+ return cent;
+ }
+
+ Box bounds( void ) {
+
+ // TODO: Cache this
+
+ _bounds._max = _points[0];
+ _bounds._min = _points[0];
+
+ for ( int i = 1; i < size(); i++ ) {
+
+ _bounds._max._x = max( _bounds._max._x, _points[i]._x );
+ _bounds._max._y = max( _bounds._max._y, _points[i]._y );
+ _bounds._min._x = min( _bounds._min._x, _points[i]._x );
+ _bounds._min._y = min( _bounds._min._y, _points[i]._y );
+
+ }
+
+ return _bounds;
+
+ }
+
+ private:
+
+ bool _centroidCalculated;
+ Point _centroid;
+
+ Box _bounds;
+
+ vector<Point> _points;
+ };
+
+ class Geo2dPlugin : public IndexPlugin {
+ public:
+ Geo2dPlugin() : IndexPlugin( GEO2DNAME ) {
+ }
+
+ virtual IndexType* generate( const IndexSpec* spec ) const {
+ return new Geo2dType( this , spec );
+ }
+ } geo2dplugin;
+
+ void __forceLinkGeoPlugin() {
+ geo2dplugin.getName();
+ }
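+    // The plugin registers itself under the name "2d", so an index created as, e.g.,
+    //   db.places.ensureIndex( { loc : "2d" }, { bits : 26, min : -180, max : 180 } )
+    // is generated by Geo2dPlugin above, and a query such as
+    //   db.places.find( { loc : { $near : [ 0, 0 ] } } )
+    // rates the index OPTIMAL via Geo2dType::suitability().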
+
+
+
+ class GeoHopper;
+
+ class GeoPoint {
+ public:
+
+        GeoPoint() : _distance( -1 ), _exact( false ), _dirty( false ), _pos( -1 )
+ {}
+
+ //// Distance not used ////
+
+ GeoPoint( const GeoKeyNode& node )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( -1 ) , _exact( false ), _dirty( false ), _bucket( node._bucket ), _pos( node._keyOfs ) {
+ }
+
+ //// Immediate initialization of distance ////
+
+ GeoPoint( const GeoKeyNode& node, double distance, bool exact )
+ : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) {
+ }
+
+ GeoPoint( const GeoPoint& pt, double distance, bool exact )
+ : _key( pt.key() ) , _loc( pt.loc() ) , _o( pt.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) {
+ }
+
+ bool operator<( const GeoPoint& other ) const {
+ if( _distance != other._distance ) return _distance < other._distance;
+ if( _exact != other._exact ) return _exact < other._exact;
+ return _loc < other._loc;
+ }
+
+ double distance() const {
+ return _distance;
+ }
+
+ bool isExact() const {
+ return _exact;
+ }
+
+ BSONObj key() const {
+ return _key;
+ }
+
+ bool hasLoc() const {
+            return ! _loc.isNull();
+ }
+
+ DiskLoc loc() const {
+ assert( ! _dirty );
+ return _loc;
+ }
+
+ BSONObj obj() const {
+ return _o;
+ }
+
+ BSONObj pt() const {
+ return _pt;
+ }
+
+ bool isEmpty() {
+ return _o.isEmpty();
+ }
+
+ bool isCleanAndEmpty() {
+ return isEmpty() && ! isDirty();
+ }
+
+ string toString() const {
+ return str::stream() << "Point from " << _key << " - " << _o << " dist : " << _distance << ( _exact ? " (ex)" : " (app)" );
+ }
+
+
+ // TODO: Recover from yield by finding all the changed disk locs here, modifying the _seenPts array.
+ // Not sure yet the correct thing to do about _seen.
+ // Definitely need to re-find our current max/min locations too
+ bool unDirty( const Geo2dType* g, DiskLoc& oldLoc ){
+
+ assert( _dirty );
+ assert( ! _id.isEmpty() );
+
+ oldLoc = _loc;
+ _loc = DiskLoc();
+
+ // Fast undirty
+ IndexInterface& ii = g->getDetails()->idxInterface();
+ // Check this position and the one immediately preceding
+ for( int i = 0; i < 2; i++ ){
+ if( _pos - i < 0 ) continue;
+
+ // log() << "bucket : " << _bucket << " pos " << _pos << endl;
+
+ BSONObj key;
+ DiskLoc loc;
+ ii.keyAt( _bucket, _pos - i, key, loc );
+
+ // log() << "Loc: " << loc << " Key : " << key << endl;
+
+ if( loc.isNull() ) continue;
+
+ if( key.binaryEqual( _key ) && loc.obj()["_id"].wrap( "" ).binaryEqual( _id ) ){
+ _pos = _pos - i;
+ _loc = loc;
+ _dirty = false;
+ _o = loc.obj();
+ return true;
+ }
+ }
+
+ // Slow undirty
+ scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsdetails( g->getDetails()->parentNS().c_str() ),
+ *( g->getDetails() ), _key, _key, true, 1 ) );
+
+ int count = 0;
+ while( cursor->ok() ){
+ count++;
+ if( cursor->current()["_id"].wrap( "" ).binaryEqual( _id ) ){
+ _bucket = cursor->getBucket();
+ _pos = cursor->getKeyOfs();
+ _loc = cursor->currLoc();
+ _o = _loc.obj();
+ break;
+ }
+ else{
+ LOG( CDEBUG + 1 ) << "Key doesn't match : " << cursor->current()["_id"] << " saved : " << _id << endl;
+ }
+ cursor->advance();
+ }
+
+ if( ! count ) { LOG( CDEBUG ) << "No key found for " << _key << endl; }
+
+ _dirty = false;
+
+ return _loc == oldLoc;
+ }
+
+ bool isDirty(){
+ return _dirty;
+ }
+
+ bool makeDirty(){
+ if( ! _dirty ){
+ assert( ! obj()["_id"].eoo() );
+ assert( ! _bucket.isNull() );
+ assert( _pos >= 0 );
+
+ if( _id.isEmpty() ){
+ _id = obj()["_id"].wrap( "" ).getOwned();
+ }
+ _o = BSONObj();
+ _key = _key.getOwned();
+ _pt = _pt.getOwned();
+ _dirty = true;
+
+ return true;
+ }
+
+ return false;
+ }
+
+ BSONObj _key;
+ DiskLoc _loc;
+ BSONObj _o;
+ BSONObj _pt;
+
+ double _distance;
+ bool _exact;
+
+ BSONObj _id;
+ bool _dirty;
+ DiskLoc _bucket;
+ int _pos;
+ };
+
+ // GeoBrowse subclasses this
+ class GeoAccumulator {
+ public:
+ GeoAccumulator( const Geo2dType * g , const BSONObj& filter, bool uniqueDocs, bool needDistance )
+ : _g(g) ,
+ _lookedAt(0) ,
+ _matchesPerfd(0) ,
+ _objectsLoaded(0) ,
+ _pointsLoaded(0) ,
+ _found(0) ,
+ _uniqueDocs( uniqueDocs ) ,
+ _needDistance( needDistance )
+ {
+ if ( ! filter.isEmpty() ) {
+ _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) );
+ GEODEBUG( "Matcher is now " << _matcher->docMatcher().toString() );
+ }
+ }
+
+ virtual ~GeoAccumulator() { }
+
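+        // BAD means the key is definitely outside the search region; GOOD means it
+        // is definitely inside; BORDER means it is within error bounds of the
+        // boundary, so an exact document check is needed before accepting it.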
+ enum KeyResult { BAD, BORDER, GOOD };
+
+ virtual void add( const GeoKeyNode& node ) {
+
+ GEODEBUG( "\t\t\t\t checking key " << node._key.toString() )
+
+ _lookedAt++;
+
+ ////
+ // Approximate distance check using key data
+ ////
+ double keyD = 0;
+ Point keyP( _g, GeoHash( node._key.firstElement(), _g->_bits ) );
+ KeyResult keyOk = approxKeyCheck( keyP, keyD );
+ if ( keyOk == BAD ) {
+ GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << keyD );
+ return;
+ }
+ GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << keyD );
+
+ ////
+ // Check for match using other key (and potentially doc) criteria
+ ////
+ // Remember match results for each object
+ map<DiskLoc, bool>::iterator match = _matched.find( node.recordLoc );
+ bool newDoc = match == _matched.end();
+ if( newDoc ) {
+
+ GEODEBUG( "\t\t\t\t matching new doc with " << (_matcher ? _matcher->docMatcher().toString() : "(empty)" ) );
+
+ // matcher
+ MatchDetails details;
+ if ( _matcher.get() ) {
+ bool good = _matcher->matchesWithSingleKeyIndex( node._key , node.recordLoc , &details );
+
+ _matchesPerfd++;
+
+ if ( details._loadedObject )
+ _objectsLoaded++;
+
+ if ( ! good ) {
+ GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] );
+ _matched[ node.recordLoc ] = false;
+ return;
+ }
+ }
+
+ _matched[ node.recordLoc ] = true;
+
+ if ( ! details._loadedObject ) // don't double count
+ _objectsLoaded++;
+
+ }
+ else if( !((*match).second) ) {
+ GEODEBUG( "\t\t\t\t previously didn't match : " << node.recordLoc.obj()["_id"] );
+ return;
+ }
+
+ ////
+ // Exact check with particular data fields
+ ////
+ // Can add multiple points
+ int diff = addSpecific( node , keyP, keyOk == BORDER, keyD, newDoc );
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
+
+ }
+
+ virtual void getPointsFor( const BSONObj& key, const BSONObj& obj, vector< BSONObj >& locsForNode, bool allPoints = false ){
+
+ // Find all the location objects from the keys
+ vector< BSONObj > locs;
+ _g->getKeys( obj, allPoints ? locsForNode : locs );
+ _pointsLoaded++;
+
+ if( allPoints ) return;
+ if( locs.size() == 1 ){
+ locsForNode.push_back( locs[0] );
+ return;
+ }
+
+ // Find the particular location we want
+ GeoHash keyHash( key.firstElement(), _g->_bits );
+
+ // log() << "Hash: " << node.key << " and " << keyHash.getHash() << " unique " << _uniqueDocs << endl;
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) {
+
+ // Ignore all locations not hashed to the key's hash, since we may see
+ // those later
+ if( _g->_hash( *i ) != keyHash ) continue;
+
+ locsForNode.push_back( *i );
+
+ }
+
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node, const Point& p , bool inBounds, double d, bool newDoc ) = 0;
+ virtual KeyResult approxKeyCheck( const Point& p , double& keyD ) = 0;
+ virtual bool exactDocCheck( const Point& p , double& d ) = 0;
+ virtual bool expensiveExactCheck(){ return false; }
+
+
+ long long found() const {
+ return _found;
+ }
+
+ const Geo2dType * _g;
+ map<DiskLoc, bool> _matched;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+
+ long long _lookedAt;
+ long long _matchesPerfd;
+ long long _objectsLoaded;
+ long long _pointsLoaded;
+ long long _found;
+
+ bool _uniqueDocs;
+ bool _needDistance;
+
+ };
+
+
+ struct BtreeLocation {
+ BtreeLocation() { }
+
+ scoped_ptr<BtreeCursor> _cursor;
+ scoped_ptr<FieldRangeSet> _frs;
+ scoped_ptr<IndexSpec> _spec;
+
+ BSONObj key() {
+ return _cursor->currKey();
+ }
+
+ bool hasPrefix( const GeoHash& hash ) {
+ BSONObj k = key();
+ BSONElement e = k.firstElement();
+ if ( e.eoo() )
+ return false;
+ return GeoHash( e ).hasPrefix( hash );
+ }
+
+ bool checkAndAdvance( const GeoHash& hash, int& totalFound, GeoAccumulator* all ){
+ if( ! _cursor->ok() || ! hasPrefix( hash ) ) return false;
+
+ if( all ){
+ totalFound++;
+ GeoKeyNode n( _cursor->getBucket(), _cursor->getKeyOfs(), _cursor->currLoc(), _cursor->currKey() );
+ all->add( n );
+ }
+ _cursor->advance();
+
+ return true;
+ }
+
+ void save(){
+ _cursor->noteLocation();
+ }
+
+ void restore(){
+ _cursor->checkLocation();
+ }
+
+ string toString() {
+ stringstream ss;
+ ss << "bucket: " << _cursor->getBucket().toString() << " pos: " << _cursor->getKeyOfs() <<
+ ( _cursor->ok() ? ( str::stream() << " k: " << _cursor->currKey() << " o : " << _cursor->current()["_id"] ) : (string)"[none]" ) << endl;
+ return ss.str();
+ }
+
+ // Returns the min and max keys which bound a particular location.
+        // The only time these may be equal is when the start location exactly
+        // matches an indexed key; otherwise our expanding algorithm will fail.
+ static bool initial( const IndexDetails& id , const Geo2dType * spec ,
+ BtreeLocation& min , BtreeLocation& max ,
+ GeoHash start ,
+ int & found , GeoAccumulator * hopper ) {
+
+ //Ordering ordering = Ordering::make(spec->_order);
+
+            // It would be nice to build this directly, but a bug in max/min queries
+            // (SERVER-3766) and the lack of an interface make this the easiest approach for now.
+ BSONObj minQuery = BSON( spec->_geo << BSON( "$gt" << MINKEY << start.wrap( "$lte" ).firstElement() ) );
+ BSONObj maxQuery = BSON( spec->_geo << BSON( "$lt" << MAXKEY << start.wrap( "$gt" ).firstElement() ) );
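+            // The two cursors walk outward from the start hash: min scans descending
+            // through keys <= start, max scans ascending through keys > start.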
+
+ // log() << "MinQuery: " << minQuery << endl;
+ // log() << "MaxQuery: " << maxQuery << endl;
+
+ min._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(),
+ minQuery,
+ true,
+ false ) );
+
+ max._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(),
+ maxQuery,
+ true,
+ false ) );
+
+
+ BSONObjBuilder bob;
+ bob.append( spec->_geo, 1 );
+ for( vector<string>::const_iterator i = spec->_other.begin(); i != spec->_other.end(); i++ ){
+ bob.append( *i, 1 );
+ }
+ BSONObj iSpec = bob.obj();
+
+ min._spec.reset( new IndexSpec( iSpec ) );
+ max._spec.reset( new IndexSpec( iSpec ) );
+
+ shared_ptr<FieldRangeVector> frvMin( new FieldRangeVector( *(min._frs), *(min._spec), -1 ) );
+ shared_ptr<FieldRangeVector> frvMax( new FieldRangeVector( *(max._frs), *(max._spec), 1 ) );
+
+ min._cursor.reset(
+ BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ),
+ frvMin, -1 )
+ );
+
+ max._cursor.reset(
+ BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ),
+ frvMax, 1 )
+ );
+
+ // if( hopper ) min.checkCur( found, hopper );
+ // if( hopper ) max.checkCur( found, hopper );
+
+ return min._cursor->ok() || max._cursor->ok();
+ }
+ };
+
+
+ class GeoCursorBase : public Cursor {
+ public:
+
+ static const shared_ptr< CoveredIndexMatcher > emptyMatcher;
+
+ GeoCursorBase( const Geo2dType * spec )
+ : _spec( spec ), _id( _spec->getDetails() ) {
+
+ }
+
+ virtual DiskLoc refLoc() { return DiskLoc(); }
+
+ virtual BSONObj indexKeyPattern() {
+ return _spec->keyPattern();
+ }
+
+ virtual void noteLocation() {
+ // no-op since these are meant to be safe
+ }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() {
+ // no-op since these are meant to be safe
+ }
+
+ virtual bool supportGetMore() { return false; }
+ virtual bool supportYields() { return false; }
+
+ virtual bool getsetdup(DiskLoc loc) { return false; }
+ virtual bool modifiedKeys() const { return true; }
+ virtual bool isMultiKey() const { return false; }
+
+ virtual bool autoDedup() const { return false; }
+
+ const Geo2dType * _spec;
+ const IndexDetails * _id;
+ };
+
+ const shared_ptr< CoveredIndexMatcher > GeoCursorBase::emptyMatcher( new CoveredIndexMatcher( BSONObj(), BSONObj(), false ) );
+
+ // TODO: Pull out the cursor bit from the browse, have GeoBrowse as field of cursor to clean up
+ // this hierarchy a bit. Also probably useful to look at whether GeoAccumulator can be a member instead
+ // of a superclass.
+ class GeoBrowse : public GeoCursorBase , public GeoAccumulator {
+ public:
+
+ // The max points which should be added to an expanding box at one time
+ static const int maxPointsHeuristic = 50;
+
+ // Expand states
+ enum State {
+ START ,
+ DOING_EXPAND ,
+ DONE_NEIGHBOR ,
+ DONE
+ } _state;
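+        // START initializes the btree cursors over the current box, DOING_EXPAND
+        // scans keys within it, DONE_NEIGHBOR steps through the eight surrounding
+        // boxes (restarting at START for each), and DONE means the search space is
+        // exhausted.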
+
+ GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj(), bool uniqueDocs = true, bool needDistance = false )
+ : GeoCursorBase( g ), GeoAccumulator( g , filter, uniqueDocs, needDistance ) ,
+ _type( type ) , _filter( filter ) , _firstCall(true), _noted( false ), _nscanned(), _nDirtied(0), _nChangedOnYield(0), _nRemovedOnYield(0), _centerPrefix(0, 0, 0) {
+
+ // Set up the initial expand state
+ _state = START;
+ _neighbor = -1;
+ _foundInExp = 0;
+
+ }
+
+ virtual string toString() {
+ return (string)"GeoBrowse-" + _type;
+ }
+
+ virtual bool ok() {
+
+ bool filled = false;
+
+ LOG( CDEBUG ) << "Checking cursor, in state " << (int) _state << ", first call " << _firstCall <<
+ ", empty : " << _cur.isEmpty() << ", dirty : " << _cur.isDirty() << ", stack : " << _stack.size() << endl;
+
+ bool first = _firstCall;
+ if ( _firstCall ) {
+ fillStack( maxPointsHeuristic );
+ filled = true;
+ _firstCall = false;
+ }
+ if ( ! _cur.isCleanAndEmpty() || _stack.size() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return true;
+ }
+
+ while ( moreToDo() ) {
+
+ LOG( CDEBUG ) << "Refilling stack..." << endl;
+
+ fillStack( maxPointsHeuristic );
+ filled = true;
+
+ if ( ! _cur.isCleanAndEmpty() ) {
+ if ( first ) {
+ ++_nscanned;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return true;
+ }
+ }
+
+ if( _noted && filled ) noteLocation();
+ return false;
+ }
+
+ virtual bool advance() {
+ _cur._o = BSONObj();
+
+ if ( _stack.size() ) {
+ _cur = _stack.front();
+ _stack.pop_front();
+ ++_nscanned;
+ return true;
+ }
+
+ if ( ! moreToDo() )
+ return false;
+
+ bool filled = false;
+ while ( _cur.isCleanAndEmpty() && moreToDo() ){
+ fillStack( maxPointsHeuristic );
+ filled = true;
+ }
+
+ if( _noted && filled ) noteLocation();
+ return ! _cur.isCleanAndEmpty() && ++_nscanned;
+ }
+
+ virtual void noteLocation() {
+ _noted = true;
+
+ LOG( CDEBUG ) << "Noting location with " << _stack.size() << ( _cur.isEmpty() ? "" : " + 1 " ) << " points " << endl;
+
+ // Make sure we advance past the point we're at now,
+ // since the current location may move on an update/delete
+ // if( _state == DOING_EXPAND ){
+ // if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); }
+ // if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); }
+ // }
+
+ // Remember where our _max, _min are
+ _min.save();
+ _max.save();
+
+ LOG( CDEBUG ) << "Min " << _min.toString() << endl;
+ LOG( CDEBUG ) << "Max " << _max.toString() << endl;
+
+ // Dirty all our queued stuff
+ for( list<GeoPoint>::iterator i = _stack.begin(); i != _stack.end(); i++ ){
+
+ LOG( CDEBUG ) << "Undirtying stack point with id " << i->_id << endl;
+
+ if( i->makeDirty() ) _nDirtied++;
+ assert( i->isDirty() );
+ }
+
+ // Check current item
+ if( ! _cur.isEmpty() ){
+ if( _cur.makeDirty() ) _nDirtied++;
+ }
+
+ // Our cached matches become invalid now
+ _matched.clear();
+ }
+
+ void fixMatches( DiskLoc oldLoc, DiskLoc newLoc ){
+ map<DiskLoc, bool>::iterator match = _matched.find( oldLoc );
+ if( match != _matched.end() ){
+ bool val = match->second;
+ _matched.erase( oldLoc );
+ _matched[ newLoc ] = val;
+ }
+ }
+
+ /* called before query getmore block is iterated */
+ virtual void checkLocation() {
+
+ LOG( CDEBUG ) << "Restoring location with " << _stack.size() << ( ! _cur.isDirty() ? "" : " + 1 " ) << " points " << endl;
+
+ // We can assume an error was thrown earlier if this database somehow disappears
+
+ // Recall our _max, _min
+ _min.restore();
+ _max.restore();
+
+ LOG( CDEBUG ) << "Min " << _min.toString() << endl;
+ LOG( CDEBUG ) << "Max " << _max.toString() << endl;
+
+ // If the current key moved, we may have been advanced past the current point - need to check this
+ // if( _state == DOING_EXPAND ){
+ // if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); }
+ // if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); }
+ //}
+
+            // Undirty all the queued stuff
+ list<GeoPoint>::iterator i = _stack.begin();
+ while( i != _stack.end() ){
+
+ LOG( CDEBUG ) << "Undirtying stack point with id " << i->_id << endl;
+
+ DiskLoc oldLoc;
+ if( i->unDirty( _spec, oldLoc ) ){
+ // Document is in same location
+ LOG( CDEBUG ) << "Undirtied " << oldLoc << endl;
+
+ i++;
+ }
+ else if( ! i->loc().isNull() ){
+
+ // Re-found document somewhere else
+ LOG( CDEBUG ) << "Changed location of " << i->_id << " : " << i->loc() << " vs " << oldLoc << endl;
+
+ _nChangedOnYield++;
+ fixMatches( oldLoc, i->loc() );
+ i++;
+ }
+ else {
+
+ // Can't re-find document
+ LOG( CDEBUG ) << "Removing document " << i->_id << endl;
+
+ _nRemovedOnYield++;
+ _found--;
+ assert( _found >= 0 );
+
+ // Can't find our key again, remove
+ i = _stack.erase( i );
+ }
+ }
+
+ if( _cur.isDirty() ){
+ LOG( CDEBUG ) << "Undirtying cur point with id : " << _cur._id << endl;
+ }
+
+ // Check current item
+ DiskLoc oldLoc;
+ if( _cur.isDirty() && ! _cur.unDirty( _spec, oldLoc ) ){
+ if( _cur.loc().isNull() ){
+
+ // Document disappeared!
+ LOG( CDEBUG ) << "Removing cur point " << _cur._id << endl;
+
+ _nRemovedOnYield++;
+ advance();
+ }
+ else{
+
+ // Document moved
+ LOG( CDEBUG ) << "Changed location of cur point " << _cur._id << " : " << _cur.loc() << " vs " << oldLoc << endl;
+
+ _nChangedOnYield++;
+ fixMatches( oldLoc, _cur.loc() );
+ }
+ }
+
+ _noted = false;
+ }
+
+ virtual Record* _current() { assert(ok()); LOG( CDEBUG + 1 ) << "_current " << _cur._loc.obj()["_id"] << endl; return _cur._loc.rec(); }
+ virtual BSONObj current() { assert(ok()); LOG( CDEBUG + 1 ) << "current " << _cur._o << endl; return _cur._o; }
+ virtual DiskLoc currLoc() { assert(ok()); LOG( CDEBUG + 1 ) << "currLoc " << _cur._loc << endl; return _cur._loc; }
+ virtual BSONObj currKey() const { return _cur._key; }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _matcher.get() ) return _matcher.get();
+ else return GeoCursorBase::emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _matcher.get() ) return _matcher;
+ else return GeoCursorBase::emptyMatcher;
+ }
+
+        // Is there more searching to do for points?
+ virtual bool moreToDo() {
+ return _state != DONE;
+ }
+
+ virtual bool supportGetMore() { return true; }
+
+ // Fills the stack, but only checks a maximum number of maxToCheck points at a time.
+ // Further calls to this function will continue the expand/check neighbors algorithm.
+ virtual void fillStack( int maxToCheck, int maxToAdd = -1, bool onlyExpand = false ) {
+
+#ifdef GEODEBUGGING
+ log() << "Filling stack with maximum of " << maxToCheck << ", state : " << (int) _state << endl;
+#endif
+
+ if( maxToAdd < 0 ) maxToAdd = maxToCheck;
+ int maxFound = _foundInExp + maxToCheck;
+ assert( maxToCheck > 0 );
+ assert( maxFound > 0 );
+ assert( _found <= 0x7fffffff ); // conversion to int
+ int maxAdded = static_cast<int>(_found) + maxToAdd;
+ assert( maxAdded >= 0 ); // overflow check
+
+ bool isNeighbor = _centerPrefix.constrains();
+
+ // Starting a box expansion
+ if ( _state == START ) {
+
+ // Get the very first hash point, if required
+ if( ! isNeighbor )
+ _prefix = expandStartHash();
+
+ GEODEBUG( "initializing btree" );
+
+#ifdef GEODEBUGGING
+ log() << "Initializing from b-tree with hash of " << _prefix << " @ " << Box( _g, _prefix ) << endl;
+#endif
+
+ if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , _prefix , _foundInExp , this ) )
+ _state = isNeighbor ? DONE_NEIGHBOR : DONE;
+ else {
+ _state = DOING_EXPAND;
+ _lastPrefix.reset();
+ }
+
+            GEODEBUG( (_state == DONE_NEIGHBOR || _state == DONE ? "not initialized" : "initialized") );
+
+ }
+
+ // Doing the actual box expansion
+ if ( _state == DOING_EXPAND ) {
+
+ while ( true ) {
+
+ GEODEBUG( "box prefix [" << _prefix << "]" );
+#ifdef GEODEBUGGING
+ if( _prefix.constrains() ) {
+ log() << "current expand box : " << Box( _g, _prefix ).toString() << endl;
+ }
+ else {
+ log() << "max expand box." << endl;
+ }
+#endif
+
+ GEODEBUG( "expanding box points... ");
+
+ // Record the prefix we're actively exploring...
+ _expPrefix.reset( new GeoHash( _prefix ) );
+
+ // Find points inside this prefix
+ while ( _min.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+ while ( _max.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+
+#ifdef GEODEBUGGING
+
+ log() << "finished expand, checked : " << ( maxToCheck - ( maxFound - _foundInExp ) )
+ << " found : " << ( maxToAdd - ( maxAdded - _found ) )
+ << " max : " << maxToCheck << " / " << maxToAdd << endl;
+
+#endif
+
+ GEODEBUG( "finished expand, found : " << ( maxToAdd - ( maxAdded - _found ) ) );
+ if( _foundInExp >= maxFound || _found >= maxAdded ) return;
+
+ // We've searched this prefix fully, remember
+ _lastPrefix.reset( new GeoHash( _prefix ));
+
+ // If we've searched the entire space, we're finished.
+ if ( ! _prefix.constrains() ) {
+ GEODEBUG( "box exhausted" );
+ _state = DONE;
+ notePrefix();
+ return;
+ }
+
+ // If we won't fit in the box, and we're not doing a sub-scan, increase the size
+ if ( ! fitsInBox( _g->sizeEdge( _prefix ) ) && _fringe.size() == 0 ) {
+
+ // If we're still not expanded bigger than the box size, expand again
+ // TODO: Is there an advantage to scanning prior to expanding?
+ _prefix = _prefix.up();
+ continue;
+
+ }
+
+ // log() << "finished box prefix [" << _prefix << "]" << endl;
+
+ // We're done and our size is large enough
+ _state = DONE_NEIGHBOR;
+
+ // Go to the next sub-box, if applicable
+ if( _fringe.size() > 0 ) _fringe.pop_back();
+ // Go to the next neighbor if this was the last sub-search
+ if( _fringe.size() == 0 ) _neighbor++;
+
+ break;
+
+ }
+
+ notePrefix();
+ }
+
+            // If we were asked only to expand the current box, don't move on to the neighbors
+ if( onlyExpand ) return;
+
+ // If we're done expanding the current box...
+ if( _state == DONE_NEIGHBOR ) {
+
+ // Iterate to the next neighbor
+ // Loop is useful for cases where we want to skip over boxes entirely,
+ // otherwise recursion increments the neighbors.
+ for ( ; _neighbor < 9; _neighbor++ ) {
+
+ // If we have no fringe for the neighbor, make sure we have the default fringe
+ if( _fringe.size() == 0 ) _fringe.push_back( "" );
+
+ if( ! isNeighbor ) {
+ _centerPrefix = _prefix;
+ _centerBox = Box( _g, _centerPrefix );
+ isNeighbor = true;
+ }
+
+ int i = (_neighbor / 3) - 1;
+ int j = (_neighbor % 3) - 1;
+
+ if ( ( i == 0 && j == 0 ) ||
+ ( i < 0 && _centerPrefix.atMinX() ) ||
+ ( i > 0 && _centerPrefix.atMaxX() ) ||
+ ( j < 0 && _centerPrefix.atMinY() ) ||
+ ( j > 0 && _centerPrefix.atMaxY() ) ) {
+
+ //log() << "not moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << endl;
+ //log() << _centerPrefix.atMinX() << " "
+ // << _centerPrefix.atMinY() << " "
+ // << _centerPrefix.atMaxX() << " "
+ // << _centerPrefix.atMaxY() << " " << endl;
+
+ continue; // main box or wrapped edge
+ // TODO: We may want to enable wrapping in future, probably best as layer on top of
+ // this search.
+ }
+
+ // Make sure we've got a reasonable center
+ assert( _centerPrefix.constrains() );
+
+ GeoHash _neighborPrefix = _centerPrefix;
+ _neighborPrefix.move( i, j );
+
+ //log() << "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << " " << _neighborPrefix << endl;
+
+ GEODEBUG( "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() );
+ PREFIXDEBUG( _centerPrefix, _g );
+ PREFIXDEBUG( _neighborPrefix , _g );
+ while( _fringe.size() > 0 ) {
+
+ _prefix = _neighborPrefix + _fringe.back();
+ Box cur( _g , _prefix );
+
+ PREFIXDEBUG( _prefix, _g );
+
+ double intAmt = intersectsBox( cur );
+
+ // No intersection
+ if( intAmt <= 0 ) {
+ GEODEBUG( "skipping box" << cur.toString() );
+ _fringe.pop_back();
+ continue;
+ }
+ // Small intersection, refine search
+ else if( intAmt < 0.5 && _prefix.canRefine() && _fringe.back().size() < 4 /* two bits */ ) {
+
+ GEODEBUG( "Intersection small : " << intAmt << ", adding to fringe: " << _fringe.back() << " curr prefix : " << _prefix << " bits : " << _prefix.getBits() );
+
+ // log() << "Diving to level : " << ( _fringe.back().size() / 2 + 1 ) << endl;
+
+ string lastSuffix = _fringe.back();
+ _fringe.pop_back();
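+                    // Subdivide this box into its four child quadrants by
+                    // appending two more hash bits to the suffix.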
+ _fringe.push_back( lastSuffix + "00" );
+ _fringe.push_back( lastSuffix + "01" );
+ _fringe.push_back( lastSuffix + "11" );
+ _fringe.push_back( lastSuffix + "10" );
+
+ continue;
+ }
+
+ // Restart our search from a diff box.
+ _state = START;
+
+ assert( ! onlyExpand );
+
+ assert( _found <= 0x7fffffff );
+ fillStack( maxFound - _foundInExp, maxAdded - static_cast<int>(_found) );
+
+ // When we return from the recursive fillStack call, we'll either have checked enough points or
+ // be entirely done. Max recurse depth is < 8 * 16.
+
+ // If we're maxed out on points, return
+ if( _foundInExp >= maxFound || _found >= maxAdded ) {
+ // Make sure we'll come back to add more points
+ assert( _state == DOING_EXPAND );
+ return;
+ }
+
+ // Otherwise we must be finished to return
+ assert( _state == DONE );
+ return;
+
+ }
+
+ }
+
+ // Finished with neighbors
+ _state = DONE;
+ }
+
+ }
+
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() = 0;
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) = 0;
+
+ // The amount the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) = 0;
+
+ bool remembered( BSONObj o ){
+ BSONObj seenId = o["_id"].wrap("").getOwned();
+ if( _seenIds.find( seenId ) != _seenIds.end() ){
+ LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " already seen." << endl;
+ return true;
+ }
+ else{
+ _seenIds.insert( seenId );
+ LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " remembered." << endl;
+ return false;
+ }
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node , const Point& keyP , bool onBounds , double keyD , bool potentiallyNewDoc ) {
+
+ int found = 0;
+
+            // We need to handle every possible point in this method, even those not in the key
+            // value, to avoid having to track which hashes we've already seen.
+ if( ! potentiallyNewDoc ){
+ // log() << "Already handled doc!" << endl;
+ return 0;
+ }
+
+ // Final check for new doc
+ // OK to touch, since we're probably returning this object now
+ if( remembered( node.recordLoc.obj() ) ) return 0;
+
+ if( _uniqueDocs && ! onBounds ) {
+ //log() << "Added ind to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ }
+ else {
+ // We now handle every possible point in the document, even those not in the key value,
+ // since we're iterating through them anyway - prevents us from having to save the hashes
+ // we've seen per-doc
+
+ // If we're filtering by hash, get the original
+ bool expensiveExact = expensiveExactCheck();
+
+ vector< BSONObj > locs;
+ getPointsFor( node._key, node.recordLoc.obj(), locs, true );
+ for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ){
+
+ double d = -1;
+ Point p( *i );
+
+ // We can avoid exact document checks by redoing approx checks,
+ // if the exact checks are more expensive.
+ bool needExact = true;
+ if( expensiveExact ){
+ assert( false );
+ KeyResult result = approxKeyCheck( p, d );
+ if( result == BAD ) continue;
+ else if( result == GOOD ) needExact = false;
+ }
+
+ if( ! needExact || exactDocCheck( p, d ) ){
+ //log() << "Added mult to " << _type << endl;
+ _stack.push_front( GeoPoint( node ) );
+ found++;
+ // If returning unique, just exit after first point is added
+ if( _uniqueDocs ) break;
+ }
+ }
+ }
+
+ while( _cur.isCleanAndEmpty() && _stack.size() > 0 ){
+ _cur = _stack.front();
+ _stack.pop_front();
+ }
+
+ return found;
+ }
+
+ virtual long long nscanned() {
+ if ( _firstCall ) {
+ ok();
+ }
+ return _nscanned;
+ }
+
+ virtual void explainDetails( BSONObjBuilder& b ){
+ b << "lookedAt" << _lookedAt;
+ b << "matchesPerfd" << _matchesPerfd;
+ b << "objectsLoaded" << _objectsLoaded;
+ b << "pointsLoaded" << _pointsLoaded;
+ b << "pointsSavedForYield" << _nDirtied;
+ b << "pointsChangedOnYield" << _nChangedOnYield;
+ b << "pointsRemovedOnYield" << _nRemovedOnYield;
+ }
+
+ virtual BSONObj prettyIndexBounds() const {
+
+ vector<GeoHash>::const_iterator i = _expPrefixes.end();
+ if( _expPrefixes.size() > 0 && *(--i) != *( _expPrefix.get() ) )
+ _expPrefixes.push_back( *( _expPrefix.get() ) );
+
+ BSONObjBuilder bob;
+ BSONArrayBuilder bab;
+ for( i = _expPrefixes.begin(); i != _expPrefixes.end(); ++i ){
+ bab << Box( _g, *i ).toBSON();
+ }
+ bob << _g->_geo << bab.arr();
+
+ return bob.obj();
+
+ }
+
+ void notePrefix() {
+ _expPrefixes.push_back( _prefix );
+ }
+
+ string _type;
+ BSONObj _filter;
+ list<GeoPoint> _stack;
+ set<BSONObj> _seenIds;
+
+ GeoPoint _cur;
+ bool _firstCall;
+ bool _noted;
+
+ long long _nscanned;
+ long long _nDirtied;
+ long long _nChangedOnYield;
+ long long _nRemovedOnYield;
+
+ // The current box we're expanding (-1 is first/center box)
+ int _neighbor;
+
+ // The points we've found so far
+ // TODO: Long long?
+ int _foundInExp;
+
+ // The current hash prefix we're expanding and the center-box hash prefix
+ GeoHash _prefix;
+ shared_ptr<GeoHash> _lastPrefix;
+ GeoHash _centerPrefix;
+ list<string> _fringe;
+ int recurseDepth;
+ Box _centerBox;
+
+ // Start and end of our search range in the current box
+ BtreeLocation _min;
+ BtreeLocation _max;
+
+ shared_ptr<GeoHash> _expPrefix;
+ mutable vector<GeoHash> _expPrefixes;
+
+ };
+
+
+ class GeoHopper : public GeoBrowse {
+ public:
+ typedef multiset<GeoPoint> Holder;
+
+ GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = true )
+ : GeoBrowse( g, "search", filter, uniqueDocs, needDistance ), _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _distError( type == GEO_PLAIN ? g->_error : g->_errorSphere ), _farthest(0)
+ {}
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ // Always check approximate distance, since it lets us avoid doing
+ // checks of the rest of the object if it succeeds
+
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ break;
+ default: assert( false );
+ }
+ assert( d >= 0 );
+
+ GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString()
+ << "\t" << p.toString() << "\t" << d
+ << " farthest: " << farthest() );
+
+ // If we need more points
+ double borderDist = ( _points.size() < _max ? _maxDistance : farthest() );
+
+ if( d >= borderDist - 2 * _distError && d <= borderDist + 2 * _distError ) return BORDER;
+ else return d < borderDist ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ bool within = false;
+
+ // Get the appropriate distance for the type
+ switch ( _type ) {
+ case GEO_PLAIN:
+ d = _near.distance( p );
+ within = _near.distanceWithin( p, _maxDistance );
+ break;
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ d = spheredist_deg( _near, p );
+ within = ( d <= _maxDistance );
+ break;
+ default: assert( false );
+ }
+
+ return within;
+ }
+
+ // Always in distance units, whether radians or normal
+ double farthest() const {
+ return _farthest;
+ }
+
+ virtual int addSpecific( const GeoKeyNode& node, const Point& keyP, bool onBounds, double keyD, bool potentiallyNewDoc ) {
+
+ // Unique documents
+
+ GeoPoint newPoint( node, keyD, false );
+
+ int prevSize = _points.size();
+
+ // STEP 1 : Remove old duplicate points from the set if needed
+ if( _uniqueDocs ){
+
+ // Lookup old point with same doc
+ map< DiskLoc , Holder::iterator >::iterator oldPointIt = _seenPts.find( newPoint.loc() );
+
+ if( oldPointIt != _seenPts.end() ){
+ const GeoPoint& oldPoint = *(oldPointIt->second);
+ // We don't need to care if we've already seen this same approx pt or better,
+ // or we've already gone to disk once for the point
+ if( oldPoint < newPoint ){
+ GEODEBUG( "\t\tOld point closer than new point" );
+ return 0;
+ }
+ GEODEBUG( "\t\tErasing old point " << oldPointIt->first.obj() );
+ _points.erase( oldPointIt->second );
+ }
+ }
+
+ Holder::iterator newIt = _points.insert( newPoint );
+ if( _uniqueDocs ) _seenPts[ newPoint.loc() ] = newIt;
+
+ GEODEBUG( "\t\tInserted new point " << newPoint.toString() << " approx : " << keyD );
+
+ assert( _max > 0 );
+
+ Holder::iterator lastPtIt = _points.end();
+ lastPtIt--;
+ _farthest = lastPtIt->distance() + 2 * _distError;
+
+ return _points.size() - prevSize;
+
+ }
+
+ // Removes extra points from end of _points set.
+ // Check can be a bit costly if we have lots of exact points near borders,
+        // so we'll only do this every once in a while.
+ void processExtraPoints(){
+
+ if( _points.size() == 0 ) return;
+
+ int prevSize = _points.size();
+
+ // Erase all points from the set with a position >= _max *and*
+ // whose distance isn't close to the _max - 1 position distance
+
+ int numToErase = _points.size() - _max;
+ if( numToErase < 0 ) numToErase = 0;
+
+ // Get the first point definitely in the _points array
+ Holder::iterator startErase = _points.end();
+ for( int i = 0; i < numToErase + 1; i++ ) startErase--;
+ _farthest = startErase->distance() + 2 * _distError;
+
+ GEODEBUG( "\t\tPotentially erasing " << numToErase << " points, " << " size : " << _points.size() << " max : " << _max << " dist : " << startErase->distance() << " farthest dist : " << _farthest << " from error : " << _distError );
+
+ startErase++;
+ while( numToErase > 0 && startErase->distance() <= _farthest ){
+ GEODEBUG( "\t\tNot erasing point " << startErase->toString() );
+ numToErase--;
+ startErase++;
+ assert( startErase != _points.end() || numToErase == 0 );
+ }
+
+ if( _uniqueDocs ){
+ for( Holder::iterator i = startErase; i != _points.end(); ++i )
+ _seenPts.erase( i->loc() );
+ }
+
+ _points.erase( startErase, _points.end() );
+
+ int diff = _points.size() - prevSize;
+ if( diff > 0 ) _found += diff;
+ else _found -= -diff;
+
+ }
+
+ unsigned _max;
+ Point _near;
+ Holder _points;
+ double _maxDistance;
+ GeoDistType _type;
+ double _distError;
+ double _farthest;
+
+ // Safe to use currently since we don't yield in $near searches. If we do start to yield, we may need to
+ // replace dirtied disklocs in our holder / ensure our logic is correct.
+ map< DiskLoc , Holder::iterator > _seenPts;
+
+ };
+
+
+
+ class GeoSearch : public GeoHopper {
+ public:
+ GeoSearch( const Geo2dType * g , const Point& startPt , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = false )
+ : GeoHopper( g , numWanted , startPt , filter , maxDistance, type, uniqueDocs, needDistance ),
+ _start( g->hash( startPt._x, startPt._y ) ),
+ // TODO: Remove numWanted...
+ _numWanted( numWanted ),
+ _type(type)
+ {
+
+ assert( g->getDetails() );
+ _nscanned = 0;
+ _found = 0;
+
+ if( _maxDistance < 0 ){
+ _scanDistance = numeric_limits<double>::max();
+ }
+ else if (type == GEO_PLAIN) {
+ _scanDistance = maxDistance + _spec->_error;
+ }
+ else if (type == GEO_SPHERE) {
+ checkEarthBounds( startPt );
+ // TODO: consider splitting into x and y scan distances
+ _scanDistance = computeXScanDistance( startPt._y, rad2deg( _maxDistance ) + _spec->_error );
+ }
+
+ assert( _scanDistance > 0 );
+
+ }
+
+
+ /** Keys we've already looked at. Checking this before add() avoids some of the
+ work of extracting the key BSON for an already-seen point.
+ */
+ private:
+ set< pair<DiskLoc,int> > _seen;
+ public:
+
+ void exec() {
+
+ if( _numWanted == 0 ) return;
+
+ /*
+ * Search algorithm
+ * 1) use geohash prefix to find X items
+ * 2) compute the max distance from the start point to an item
+ * 3) find the optimal set of boxes that completes the circle
+ * 4) use regular btree cursors to scan those boxes
+ */
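+ // e.g. (illustrative): a query like
+ //   db.places.find( { loc : { $near : [ -73.88 , 40.78 ] , $maxDistance : 0.1 } } )
+ // reaches here via Geo2dType::newCursor() with _near = (-73.88, 40.78).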
+
+#ifdef GEODEBUGGING
+
+ log() << "start near search for " << _numWanted << " points near " << _near << " (max dist " << _maxDistance << ")" << endl;
+
+#endif
+
+ // Part 1
+ {
+ do {
+ long long f = found();
+ assert( f <= 0x7fffffff );
+ fillStack( maxPointsHeuristic, _numWanted - static_cast<int>(f) , true );
+ processExtraPoints();
+ } while( _state != DONE && _state != DONE_NEIGHBOR &&
+ found() < _numWanted &&
+ (! _prefix.constrains() || _g->sizeEdge( _prefix ) <= _scanDistance ) );
+
+ // If we couldn't scan or scanned everything, we're done
+ if( _state == DONE ){
+ expandEndPoints();
+ return;
+ }
+ }
+
+#ifdef GEODEBUGGING
+
+ log() << "part 1 of near search completed, found " << found() << " points (out of " << _foundInExp << " scanned)"
+ << " in expanded region " << _prefix << " @ " << Box( _g, _prefix )
+ << " with furthest distance " << farthest() << endl;
+
+#endif
+
+ // Part 2
+ {
+
+ // Find farthest distance for completion scan
+ double farDist = farthest();
+ if( found() < _numWanted ) {
+ // Not enough found in Part 1
+ farDist = _scanDistance;
+ }
+ else if ( _type == GEO_PLAIN ) {
+ // Enough found, but need to search neighbor boxes
+ farDist += _spec->_error;
+ }
+ else if ( _type == GEO_SPHERE ) {
+ // Enough found, but need to search neighbor boxes
+ farDist = std::min( _scanDistance, computeXScanDistance( _near._y, rad2deg( farDist ) ) + 2 * _spec->_error );
+ }
+ assert( farDist >= 0 );
+ GEODEBUGPRINT( farDist );
+
+ // Find the box that includes all the points we need to return
+ _want = Box( _near._x - farDist , _near._y - farDist , farDist * 2 );
+ GEODEBUGPRINT( _want.toString() );
+
+ // log() << "Found : " << found() << " wanted : " << _numWanted << " Far distance : " << farDist << " box : " << _want << endl;
+
+ // Remember the far distance for further scans
+ _scanDistance = farDist;
+
+ // Reset the search, our distances have probably changed
+ if( _state == DONE_NEIGHBOR ){
+ _state = DOING_EXPAND;
+ _neighbor = -1;
+ }
+
+#ifdef GEODEBUGGING
+
+ log() << "resetting search with start at " << _start << " (edge length " << _g->sizeEdge( _start ) << ")" << endl;
+
+#endif
+
+ // Do regular search in the full region
+ do {
+ fillStack( maxPointsHeuristic );
+ processExtraPoints();
+ }
+ while( _state != DONE );
+
+ }
+
+ GEODEBUG( "done near search with " << _points.size() << " points " );
+
+ expandEndPoints();
+
+ }
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, bool force ){
+ int before, after;
+ addExactPoints( pt, points, before, after, force );
+ }
+
+ void addExactPoints( const GeoPoint& pt, Holder& points, int& before, int& after, bool force ){
+
+ before = 0;
+ after = 0;
+
+ GEODEBUG( "Adding exact points for " << pt.toString() );
+
+ if( pt.isExact() ){
+ if( force ) points.insert( pt );
+ return;
+ }
+
+ vector<BSONObj> locs;
+ getPointsFor( pt.key(), pt.obj(), locs, _uniqueDocs );
+
+ GeoPoint nearestPt( pt, -1, true );
+
+ for( vector<BSONObj>::iterator i = locs.begin(); i != locs.end(); i++ ){
+
+ Point loc( *i );
+
+ double d;
+ if( ! exactDocCheck( loc, d ) ) continue;
+
+ if( _uniqueDocs && ( nearestPt.distance() < 0 || d < nearestPt.distance() ) ){
+ nearestPt._distance = d;
+ nearestPt._pt = *i;
+ continue;
+ }
+ else if( ! _uniqueDocs ){
+ GeoPoint exactPt( pt, d, true );
+ exactPt._pt = *i;
+ GEODEBUG( "Inserting exact pt " << exactPt.toString() << " for " << pt.toString() << " exact : " << d << " is less? " << ( exactPt < pt ) << " bits : " << _g->_bits );
+ points.insert( exactPt );
+ exactPt < pt ? before++ : after++;
+ }
+
+ }
+
+ if( _uniqueDocs && nearestPt.distance() >= 0 ){
+ GEODEBUG( "Inserting unique exact pt " << nearestPt.toString() << " for " << pt.toString() << " exact : " << nearestPt.distance() << " is less? " << ( nearestPt < pt ) << " bits : " << _g->_bits );
+ points.insert( nearestPt );
+ if( nearestPt < pt ) before++;
+ else after++;
+ }
+
+ }
+
+ // TODO: Refactor this back into the holder class, and allow it to run periodically when we are seeing a lot of points
+ void expandEndPoints( bool finish = true ){
+
+ processExtraPoints();
+
+ // All points in the array *could* be within maxDistance
+
+ // Step 1 : Trim points to max size
+ // TODO: This check does little for now, but is a skeleton for future work on
+ // incremental $near searches
+ if( _max > 0 ){
+
+ int numToErase = _points.size() - _max;
+
+ if( numToErase > 0 ){
+
+ Holder tested;
+
+ // Work backward through all points we're not sure belong in the set
+ Holder::iterator maybePointIt = _points.end();
+ maybePointIt--;
+ double approxMin = maybePointIt->distance() - 2 * _distError;
+
+ GEODEBUG( "\t\tNeed to erase " << numToErase << " max : " << _max << " min dist " << approxMin << " error : " << _distError << " starting from : " << (*maybePointIt).toString() );
+
+ // Erase all uncertain points from _points, collecting their exact versions in 'tested'
+ int erased = 0;
+ while( _points.size() > 0 && ( maybePointIt->distance() >= approxMin || erased < numToErase ) ){
+
+ Holder::iterator current = maybePointIt--;
+
+ addExactPoints( *current, tested, true );
+ _points.erase( current );
+ erased++;
+
+ if( tested.size() )
+ approxMin = tested.begin()->distance() - 2 * _distError;
+
+ }
+
+ GEODEBUG( "\t\tEnding search at point " << ( _points.size() == 0 ? "(beginning)" : maybePointIt->toString() ) );
+
+ int numToAddBack = erased - numToErase;
+ assert( numToAddBack >= 0 );
+
+ GEODEBUG( "\t\tNum tested valid : " << tested.size() << " erased : " << erased << " added back : " << numToAddBack );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = tested.begin(); it != tested.end(); it++ ){
+ log() << "Tested Point: " << *it << endl;
+ }
+#endif
+ Holder::iterator testedIt = tested.begin();
+ for( int i = 0; i < numToAddBack && testedIt != tested.end(); i++ ){
+ _points.insert( *testedIt );
+ testedIt++;
+ }
+ }
+ }
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
+ }
+#endif
+ // We've now trimmed first set of unneeded points
+
+ GEODEBUG( "\t\t Start expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Step 2: iterate through all points and add as needed
+
+ unsigned expandedPoints = 0;
+ Holder::iterator it = _points.begin();
+ double expandWindowEnd = -1;
+ while( it != _points.end() ){
+ const GeoPoint& currPt = *it;
+
+ // TODO: If one point is exact, maybe not 2 * _distError
+
+ // See if we're in an expand window
+ bool inWindow = currPt.distance() <= expandWindowEnd;
+ // If we're not, and we're done with points, break
+ if( ! inWindow && expandedPoints >= _max ) break;
+
+ bool expandApprox = ! currPt.isExact() && ( ! _uniqueDocs || ( finish && _needDistance ) || inWindow );
+
+ if( expandApprox ){
+
+ // Add new point(s)
+ // These will only be added in a radius of 2 * _distError around the current point,
+ // so should not affect previously valid points.
+ int before, after;
+ addExactPoints( currPt, _points, before, after, false );
+ expandedPoints += before;
+
+ if( _max > 0 && expandedPoints < _max )
+ expandWindowEnd = currPt.distance() + 2 * _distError;
+
+ // Iterate to the next point
+ Holder::iterator current = it++;
+ // Erase the current point
+ _points.erase( current );
+
+ }
+ else{
+ expandedPoints++;
+ it++;
+ }
+ }
+
+ GEODEBUG( "\t\tFinished expanding, num points : " << _points.size() << " max : " << _max );
+
+ // Finish
+ // TODO: Don't really need to trim?
+ for( ; expandedPoints > _max; expandedPoints-- ) it--;
+ _points.erase( it, _points.end() );
+
+#ifdef GEODEBUGGING
+ for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){
+ log() << "Point: " << *it << endl;
+ }
+#endif
+ }
+
+ virtual GeoHash expandStartHash(){
+ return _start;
+ }
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ){
+ return width >= _scanDistance;
+ }
+
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ){
+ return cur.intersects( _want );
+ }
+
+ GeoHash _start;
+ int _numWanted;
+ double _scanDistance;
+
+ long long _nscanned;
+ int _found;
+ GeoDistType _type;
+
+ Box _want;
+ };
+
+ class GeoSearchCursor : public GeoCursorBase {
+ public:
+
+ GeoSearchCursor( shared_ptr<GeoSearch> s )
+ : GeoCursorBase( s->_spec ) ,
+ _s( s ) , _cur( s->_points.begin() ) , _end( s->_points.end() ), _nscanned() {
+ if ( _cur != _end ) {
+ ++_nscanned;
+ }
+ }
+
+ virtual ~GeoSearchCursor() {}
+
+ virtual bool ok() {
+ return _cur != _end;
+ }
+
+ virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); }
+ virtual BSONObj current() { assert(ok()); return _cur->_o; }
+ virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; }
+ virtual bool advance() {
+ if( ok() ){
+ _cur++;
+ incNscanned();
+ return ok();
+ }
+ return false;
+ }
+ virtual BSONObj currKey() const { return _cur->_key; }
+
+ virtual string toString() {
+ return "GeoSearchCursor";
+ }
+
+
+ virtual BSONObj prettyStartKey() const {
+ return BSON( _s->_g->_geo << _s->_prefix.toString() );
+ }
+ virtual BSONObj prettyEndKey() const {
+ GeoHash temp = _s->_prefix;
+ temp.move( 1 , 1 );
+ return BSON( _s->_g->_geo << temp.toString() );
+ }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if( _s->_matcher.get() ) return _s->_matcher.get();
+ else return emptyMatcher.get();
+ }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+ if( _s->_matcher.get() ) return _s->_matcher;
+ else return emptyMatcher;
+ }
+
+ shared_ptr<GeoSearch> _s;
+ GeoHopper::Holder::iterator _cur;
+ GeoHopper::Holder::iterator _end;
+
+ void incNscanned() { if ( ok() ) { ++_nscanned; } }
+ long long _nscanned;
+ };
+
+ class GeoCircleBrowse : public GeoBrowse {
+ public:
+
+ GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center", bool uniqueDocs = true )
+ : GeoBrowse( g , "circle" , filter, uniqueDocs ) {
+
+ uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 );
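+ // e.g. (illustrative): { $center : [ [ 50 , 50 ] , 10 ] } searches a radius of 10
+ // around (50,50); $centerSphere takes the radius in radians instead.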
+
+ BSONObjIterator i(circle);
+ BSONElement center = i.next();
+
+ uassert( 13656 , "the first field of $center object must be a location object" , center.isABSONObj() );
+
+ // Get geohash and exact center point
+ // TODO: For wrapping search, may be useful to allow center points outside-of-bounds here.
+ // Calculating the nearest point as a hash start inside the region would then be required.
+ _start = g->_tohash(center);
+ _startPt = Point(center);
+
+ _maxDistance = i.next().numberDouble();
+ uassert( 13061 , "need a max distance >= 0 " , _maxDistance >= 0 );
+
+ if (type == "$center") {
+ // Look in box with bounds of maxDistance in either direction
+ _type = GEO_PLAIN;
+ _xScanDistance = _maxDistance + _g->_error;
+ _yScanDistance = _maxDistance + _g->_error;
+ }
+ else if (type == "$centerSphere") {
+ // Same, but compute maxDistance using spherical transform
+
+ uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI);
+ checkEarthBounds( _startPt );
+
+ _type = GEO_SPHERE;
+ _yScanDistance = rad2deg( _maxDistance ) + _g->_error;
+ _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance);
+
+ uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet",
+ (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) &&
+ (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90));
+ }
+ else {
+ uassert(13460, "invalid $center query type: " + type, false);
+ }
+
+ // Bounding box includes fudge factor.
+ // TODO: Is this correct, since fudge factor may be spherically transformed?
+ _bBox._min = Point( _startPt._x - _xScanDistance, _startPt._y - _yScanDistance );
+ _bBox._max = Point( _startPt._x + _xScanDistance, _startPt._y + _yScanDistance );
+
+ GEODEBUG( "Bounding box for circle query : " << _bBox.toString() << " (max distance : " << _maxDistance << ")" << " starting from " << _startPt.toString() );
+
+ ok();
+ }
+
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
+
+ virtual bool fitsInBox( double width ) {
+ return width >= std::max(_xScanDistance, _yScanDistance);
+ }
+
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bBox );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ // Inexact hash distance checks.
+ double error = 0;
+ switch (_type) {
+ case GEO_PLAIN:
+ d = _startPt.distance( p );
+ error = _g->_error;
+ break;
+ case GEO_SPHERE: {
+ checkEarthBounds( p );
+ d = spheredist_deg( _startPt, p );
+ error = _g->_errorSphere;
+ break;
+ }
+ default: assert( false );
+ }
+
+ // If our distance is in the error bounds...
+ if( d >= _maxDistance - error && d <= _maxDistance + error ) return BORDER;
+ return d > _maxDistance ? BAD : GOOD;
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+
+ switch (_type) {
+ case GEO_PLAIN: {
+ if( _startPt.distanceWithin( p, _maxDistance ) ) return true;
+ break;
+ }
+ case GEO_SPHERE:
+ checkEarthBounds( p );
+ if( spheredist_deg( _startPt , p ) <= _maxDistance ) return true;
+ break;
+ default: assert( false );
+ }
+
+ return false;
+ }
+
+ GeoDistType _type;
+ GeoHash _start;
+ Point _startPt;
+ double _maxDistance; // user input
+ double _xScanDistance; // affected by GeoDistType
+ double _yScanDistance; // affected by GeoDistType
+ Box _bBox;
+
+ };
+
+ class GeoBoxBrowse : public GeoBrowse {
+ public:
+
+ GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj(), bool uniqueDocs = true )
+ : GeoBrowse( g , "box" , filter, uniqueDocs ) {
+
+ uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
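+ // e.g. (illustrative): { $box : [ [ 0 , 0 ] , [ 10 , 10 ] ] } searches the axis-aligned
+ // box with corners (0,0) and (10,10); corner order is fixed up by fixBox() below.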
+
+ // Initialize an *exact* box from the given obj.
+ BSONObjIterator i(box);
+ _want._min = Point( i.next() );
+ _want._max = Point( i.next() );
+
+ _wantRegion = _want;
+ _wantRegion.fudge( g ); // Need to make sure we're checking regions within error bounds of where we want
+ fixBox( g, _wantRegion );
+ fixBox( g, _want );
+
+ uassert( 13064 , "need an area > 0 " , _want.area() > 0 );
+
+ Point center = _want.center();
+ _start = _g->hash( center._x , center._y );
+
+ GEODEBUG( "center : " << center.toString() << "\t" << _prefix );
+
+ _fudge = _g->_error;
+ _wantLen = _fudge +
+ std::max( ( _want._max._x - _want._min._x ) ,
+ ( _want._max._y - _want._min._y ) ) / 2;
+
+ ok();
+ }
+
+ void fixBox( const Geo2dType* g, Box& box ) {
+ if( box._min._x > box._max._x )
+ swap( box._min._x, box._max._x );
+ if( box._min._y > box._max._y )
+ swap( box._min._y, box._max._y );
+
+ double gMin = g->_min;
+ double gMax = g->_max;
+
+ if( box._min._x < gMin ) box._min._x = gMin;
+ if( box._min._y < gMin ) box._min._y = gMin;
+ if( box._max._x > gMax) box._max._x = gMax;
+ if( box._max._y > gMax ) box._max._y = gMax;
+ }
+
+ void swap( double& a, double& b ) {
+ double swap = a;
+ a = b;
+ b = swap;
+ }
+
+ virtual GeoHash expandStartHash() {
+ return _start;
+ }
+
+ virtual bool fitsInBox( double width ) {
+ return width >= _wantLen;
+ }
+
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _wantRegion );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+ if( _want.onBoundary( p, _fudge ) ) return BORDER;
+ else return _want.inside( p, _fudge ) ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _want.inside( p );
+ }
+
+ Box _want;
+ Box _wantRegion;
+ double _wantLen;
+ double _fudge;
+
+ GeoHash _start;
+
+ };
+
+ class GeoPolygonBrowse : public GeoBrowse {
+ public:
+
+ GeoPolygonBrowse( const Geo2dType* g , const BSONObj& polyPoints ,
+ BSONObj filter = BSONObj(), bool uniqueDocs = true ) : GeoBrowse( g , "polygon" , filter, uniqueDocs ) {
+
+ GEODEBUG( "In Polygon" )
+
+ BSONObjIterator i( polyPoints );
+ BSONElement first = i.next();
+ _poly.add( Point( first ) );
+
+ while ( i.more() ) {
+ _poly.add( Point( i.next() ) );
+ }
+
+ uassert( 14030, "polygon must be defined by three points or more", _poly.size() >= 3 );
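+ // e.g. (illustrative): { $polygon : [ [ 0 , 0 ] , [ 10 , 0 ] , [ 5 , 10 ] ] } searches
+ // the triangle with those three vertices.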
+
+ _bounds = _poly.bounds();
+ _bounds.fudge( g ); // We need to check regions within the error bounds of these bounds
+ _bounds.truncate( g ); // We don't need to look anywhere outside the space
+
+ _maxDim = _g->_error + _bounds.maxDim() / 2;
+
+ ok();
+ }
+
+ // The initial geo hash box for our first expansion
+ virtual GeoHash expandStartHash() {
+ return _g->hash( _bounds.center() );
+ }
+
+ // Whether the current box width is big enough for our search area
+ virtual bool fitsInBox( double width ) {
+ return _maxDim <= width;
+ }
+
+ // Whether the current box overlaps our search area
+ virtual double intersectsBox( Box& cur ) {
+ return cur.intersects( _bounds );
+ }
+
+ virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+ int in = _poly.contains( p, _g->_error );
+
+ if( in == 0 ) return BORDER;
+ else return in > 0 ? GOOD : BAD;
+
+ }
+
+ virtual bool exactDocCheck( const Point& p, double& d ){
+ return _poly.contains( p );
+ }
+
+ private:
+
+ Polygon _poly;
+ Box _bounds;
+ double _maxDim;
+
+ GeoHash _start;
+ };
+
+ shared_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+ if ( numWanted < 0 )
+ numWanted = numWanted * -1;
+ else if ( numWanted == 0 )
+ numWanted = 100;
+
+ BSONObjIterator i(query);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( _geo != e.fieldName() )
+ continue;
+
+ if ( e.type() == Array ) {
+ // If we get an array query, assume it is a location, and do a $within { $center : [[x, y], 0] } search
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ), "$center", true ) );
+ return c;
+ }
+ else if ( e.type() == Object ) {
+
+ // TODO: Filter out _geo : { $special... } field so it doesn't get matched accidentally,
+ // if matcher changes
+
+ switch ( e.embeddedObject().firstElement().getGtLtOp() ) {
+ case BSONObj::opNEAR: {
+ BSONObj n = e.embeddedObject();
+ e = n.firstElement();
+
+ const char* suffix = e.fieldName() + 5; // strlen("$near") == 5;
+ GeoDistType type;
+ if (suffix[0] == '\0') {
+ type = GEO_PLAIN;
+ }
+ else if (strcmp(suffix, "Sphere") == 0) {
+ type = GEO_SPHERE;
+ }
+ else {
+ uassert(13464, string("invalid $near search type: ") + e.fieldName(), false);
+ type = GEO_PLAIN; // prevents uninitialized warning
+ }
+
+ double maxDistance = numeric_limits<double>::max();
+ if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) {
+ BSONObjIterator i(e.embeddedObject());
+ i.next();
+ i.next();
+ BSONElement e = i.next();
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+ {
+ BSONElement e = n["$maxDistance"];
+ if ( e.isNumber() )
+ maxDistance = e.numberDouble();
+ }
+
+ bool uniqueDocs = false;
+ if( ! n["$uniqueDocs"].eoo() ) uniqueDocs = n["$uniqueDocs"].trueValue();
+
+ shared_ptr<GeoSearch> s( new GeoSearch( this , Point( e ) , numWanted , query , maxDistance, type, uniqueDocs ) );
+ s->exec();
+ shared_ptr<Cursor> c;
+ c.reset( new GeoSearchCursor( s ) );
+ return c;
+ }
+ case BSONObj::opWITHIN: {
+
+ e = e.embeddedObject().firstElement();
+ uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() );
+
+ BSONObj context = e.embeddedObject();
+ e = e.embeddedObject().firstElement();
+ string type = e.fieldName();
+
+ bool uniqueDocs = true;
+ if( ! context["$uniqueDocs"].eoo() ) uniqueDocs = context["$uniqueDocs"].trueValue();
+
+ if ( startsWith(type, "$center") ) {
+ uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type, uniqueDocs ) );
+ return c;
+ }
+ else if ( type == "$box" ) {
+ uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ else if ( startsWith( type, "$poly" ) ) {
+ uassert( 14029 , "$polygon has to take an object or array" , e.isABSONObj() );
+ shared_ptr<Cursor> c( new GeoPolygonBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) );
+ return c;
+ }
+ throw UserException( 13058 , str::stream() << "unknown $within information : " << context << ", a shape must be specified." );
+ }
+ default:
+ // Otherwise... assume the object defines a point, and we want to do a zero-radius $within $center
+ shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ) ) );
+ return c;
+ }
+ }
+ }
+
+ throw UserException( 13042 , (string)"missing geo field (" + _geo + ") in : " + query.toString() );
+ }
+
+ // ------
+ // commands
+ // ------
+
+ class Geo2dFindNearCmd : public Command {
+ public:
+ Geo2dFindNearCmd() : Command( "geoNear" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; }
+ bool slaveOverrideOk() { return true; }
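+ // e.g. (illustrative): db.runCommand( { geoNear : "places" , near : [ -73.88 , 40.78 ] ,
+ // num : 10 , spherical : true } )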
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ vector<int> idxs;
+ d->findIndexByType( GEO2DNAME , idxs );
+
+ if ( idxs.size() > 1 ) {
+ errmsg = "more than 1 geo indexes :(";
+ return false;
+ }
+
+ if ( idxs.size() == 0 ) {
+ errmsg = "no geo index :(";
+ return false;
+ }
+
+ int geoIdx = idxs[0];
+
+ result.append( "ns" , ns );
+
+ IndexDetails& id = d->idx( geoIdx );
+ Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+ assert( &id == g->getDetails() );
+
+ int numWanted = 100;
+ if ( cmdObj["num"].isNumber() ) {
+ numWanted = cmdObj["num"].numberInt();
+ assert( numWanted >= 0 );
+ }
+
+ bool uniqueDocs = false;
+ if( ! cmdObj["uniqueDocs"].eoo() ) uniqueDocs = cmdObj["uniqueDocs"].trueValue();
+
+ bool includeLocs = false;
+ if( ! cmdObj["includeLocs"].eoo() ) includeLocs = cmdObj["includeLocs"].trueValue();
+
+ uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo());
+ const Point n( cmdObj["near"] );
+ result.append( "near" , g->_tohash( cmdObj["near"] ).toString() );
+
+ BSONObj filter;
+ if ( cmdObj["query"].type() == Object )
+ filter = cmdObj["query"].embeddedObject();
+
+ double maxDistance = numeric_limits<double>::max();
+ if ( cmdObj["maxDistance"].isNumber() )
+ maxDistance = cmdObj["maxDistance"].number();
+
+ GeoDistType type = GEO_PLAIN;
+ if ( cmdObj["spherical"].trueValue() )
+ type = GEO_SPHERE;
+
+ GeoSearch gs( g , n , numWanted , filter , maxDistance , type, uniqueDocs, true );
+
+ if ( cmdObj["start"].type() == String) {
+ GeoHash start ((string) cmdObj["start"].valuestr());
+ gs._start = start;
+ }
+
+ gs.exec();
+
+ double distanceMultiplier = 1;
+ if ( cmdObj["distanceMultiplier"].isNumber() )
+ distanceMultiplier = cmdObj["distanceMultiplier"].number();
+
+ double totalDistance = 0;
+
+ BSONObjBuilder arr( result.subarrayStart( "results" ) );
+ int x = 0;
+ for ( GeoHopper::Holder::iterator i=gs._points.begin(); i!=gs._points.end(); i++ ) {
+
+ const GeoPoint& p = *i;
+ double dis = distanceMultiplier * p.distance();
+ totalDistance += dis;
+
+ BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) );
+ bb.append( "dis" , dis );
+ if( includeLocs ){
+ if( p._pt.couldBeArray() ) bb.append( "loc", BSONArray( p._pt ) );
+ else bb.append( "loc" , p._pt );
+ }
+ bb.append( "obj" , p._o );
+ bb.done();
+
+ if ( arr.len() > BSONObjMaxUserSize ) {
+ warning() << "Too many results to fit in single document. Truncating..." << endl;
+ break;
+ }
+ }
+ arr.done();
+
+ BSONObjBuilder stats( result.subobjStart( "stats" ) );
+ stats.append( "time" , cc().curop()->elapsedMillis() );
+ stats.appendNumber( "btreelocs" , gs._nscanned );
+ stats.appendNumber( "nscanned" , gs._lookedAt );
+ stats.appendNumber( "objectsLoaded" , gs._objectsLoaded );
+ stats.append( "avgDistance" , totalDistance / x );
+ stats.append( "maxDistance" , gs.farthest() );
+ stats.done();
+
+ return true;
+ }
+
+ } geo2dFindNearCmd;
+
+ class GeoWalkCmd : public Command {
+ public:
+ GeoWalkCmd() : Command( "geoWalk" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ bool slaveOverrideOk() { return true; }
+ bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ int geoIdx = -1;
+ {
+ NamespaceDetails::IndexIterator ii = d->ii();
+ while ( ii.more() ) {
+ IndexDetails& id = ii.next();
+ if ( id.getSpec().getTypeName() == GEO2DNAME ) {
+ if ( geoIdx >= 0 ) {
+ errmsg = "2 geo indexes :(";
+ return false;
+ }
+ geoIdx = ii.pos() - 1;
+ }
+ }
+ }
+
+ if ( geoIdx < 0 ) {
+ errmsg = "no geo index :(";
+ return false;
+ }
+
+
+ IndexDetails& id = d->idx( geoIdx );
+ Geo2dType * g = (Geo2dType*)id.getSpec().getType();
+ assert( &id == g->getDetails() );
+
+ int max = 100000;
+
+ auto_ptr<BtreeCursor> bc( BtreeCursor::make( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 ) );
+ BtreeCursor &c = *bc;
+ while ( c.ok() && max-- ) {
+ GeoHash h( c.currKey().firstElement() );
+ int len;
+ cout << "\t" << h.toString()
+ << "\t" << c.current()[g->_geo]
+ << "\t" << hex << h.getHash()
+ << "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0]
+ << "\t" << c.current()["_id"]
+ << endl;
+ c.advance();
+ }
+
+ return true;
+ }
+
+ } geoWalkCmd;
+
+ struct GeoUnitTest : public UnitTest {
+
+ int round( double d ) {
+ return (int)(.5+(d*1000));
+ }
+
+#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); }
+
+ void run() {
+ assert( ! GeoHash::isBitSet( 0 , 0 ) );
+ assert( ! GeoHash::isBitSet( 0 , 31 ) );
+ assert( GeoHash::isBitSet( 1 , 31 ) );
+
+ IndexSpec i( BSON( "loc" << "2d" ) );
+ Geo2dType g( &geo2dplugin , &i );
+ {
+ double x = 73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ double x = -73.01212;
+ double y = 41.352964;
+ BSONObj in = BSON( "x" << x << "y" << y );
+ GeoHash h = g._hash( in );
+ BSONObj out = g._unhash( h );
+ assert( round(x) == round( out["x"].number() ) );
+ assert( round(y) == round( out["y"].number() ) );
+ assert( round( in["x"].number() ) == round( out["x"].number() ) );
+ assert( round( in["y"].number() ) == round( out["y"].number() ) );
+ }
+
+ {
+ GeoHash h( "0000" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0001" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0000" );
+
+ h.init( "0001" );
+ h.move( 0 , 1 );
+ GEOHEQ( h , "0100" );
+ h.move( 0 , -1 );
+ GEOHEQ( h , "0001" );
+
+
+ h.init( "0000" );
+ h.move( 1 , 0 );
+ GEOHEQ( h , "0010" );
+ }
+
+ {
+ Box b( 5 , 5 , 2 );
+ assert( "(5,5) -->> (7,7)" == b.toString() );
+ }
+
+ {
+ GeoHash a = g.hash( 1 , 1 );
+ GeoHash b = g.hash( 4 , 5 );
+ assert( 5 == (int)(g.distance( a , b ) ) );
+ a = g.hash( 50 , 50 );
+ b = g.hash( 42 , 44 );
+ assert( round(10) == round(g.distance( a , b )) );
+ }
+
+ {
+ GeoHash x("0000");
+ assert( 0 == x.getHash() );
+ x.init( 0 , 1 , 32 );
+ GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" )
+
+ assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) );
+ assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) );
+ }
+
+ {
+ GeoHash x("1010");
+ GEOHEQ( x , "1010" );
+ GeoHash y = x + "01";
+ GEOHEQ( y , "101001" );
+ }
+
+ {
+
+ GeoHash a = g.hash( 5 , 5 );
+ GeoHash b = g.hash( 5 , 7 );
+ GeoHash c = g.hash( 100 , 100 );
+ /*
+ cout << "a: " << a << endl;
+ cout << "b: " << b << endl;
+ cout << "c: " << c << endl;
+
+ cout << "a: " << a.toStringHex1() << endl;
+ cout << "b: " << b.toStringHex1() << endl;
+ cout << "c: " << c.toStringHex1() << endl;
+ */
+ BSONObj oa = a.wrap();
+ BSONObj ob = b.wrap();
+ BSONObj oc = c.wrap();
+ /*
+ cout << "a: " << oa.hexDump() << endl;
+ cout << "b: " << ob.hexDump() << endl;
+ cout << "c: " << oc.hexDump() << endl;
+ */
+ assert( oa.woCompare( ob ) < 0 );
+ assert( oa.woCompare( oc ) < 0 );
+
+ }
+
+ {
+ GeoHash x( "000000" );
+ x.move( -1 , 0 );
+ GEOHEQ( x , "101010" );
+ x.move( 1 , -1 );
+ GEOHEQ( x , "010101" );
+ x.move( 0 , 1 );
+ GEOHEQ( x , "000000" );
+ }
+
+ {
+ GeoHash prefix( "110011000000" );
+ GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" );
+ assert( ! entry.hasPrefix( prefix ) );
+
+ entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000");
+ assert( entry.toString().find( prefix.toString() ) == 0 );
+ assert( entry.hasPrefix( GeoHash( "1100" ) ) );
+ assert( entry.hasPrefix( prefix ) );
+ }
+
+ {
+ GeoHash a = g.hash( 50 , 50 );
+ GeoHash b = g.hash( 48 , 54 );
+ assert( round( 4.47214 ) == round( g.distance( a , b ) ) );
+ }
+
+
+ {
+ Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) );
+ assert( b.inside( 29.763 , -95.363 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 ) );
+ assert( ! b.inside( 32.9570255 , -96.1082497 , .01 ) );
+ }
+
+ {
+ GeoHash a( "11001111" );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) );
+ assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) );
+ }
+
+ {
+ int N = 10000;
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_slow( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "slow: " << t.millis() << endl;
+ }
+
+ {
+ Timer t;
+ for ( int i=0; i<N; i++ ) {
+ unsigned x = (unsigned)rand();
+ unsigned y = (unsigned)rand();
+ GeoHash h( x , y );
+ unsigned a,b;
+ h.unhash_fast( a,b );
+ assert( a == x );
+ assert( b == y );
+ }
+ //cout << "fast: " << t.millis() << endl;
+ }
+
+ }
+
+ {
+ // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+
+ double dist1 = spheredist_deg(BNA, LAX);
+ double dist2 = spheredist_deg(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point BNA (-1.5127, 0.6304);
+ Point LAX (-2.0665, 0.5924);
+
+ double dist1 = spheredist_rad(BNA, LAX);
+ double dist2 = spheredist_rad(LAX, BNA);
+
+ // target is 0.45306
+ assert( 0.45305 <= dist1 && dist1 <= 0.45307 );
+ assert( 0.45305 <= dist2 && dist2 <= 0.45307 );
+ }
+ {
+ Point JFK (-73.77694444, 40.63861111 );
+ Point LAX (-118.40, 33.94);
+
+ double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES;
+ assert( dist > 2469 && dist < 2470 );
+ }
+
+ {
+ Point BNA (-86.67, 36.12);
+ Point LAX (-118.40, 33.94);
+ Point JFK (-73.77694444, 40.63861111 );
+ assert( spheredist_deg(BNA, BNA) < 1e-6);
+ assert( spheredist_deg(LAX, LAX) < 1e-6);
+ assert( spheredist_deg(JFK, JFK) < 1e-6);
+
+ Point zero (0, 0);
+ Point antizero (0,-180);
+
+ // these were known to cause NaN
+ assert( spheredist_deg(zero, zero) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6);
+ assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6);
+ }
+ }
+ }
+ } geoUnitTest;
+
+
+}
+
diff --git a/src/mongo/db/geo/core.h b/src/mongo/db/geo/core.h
new file mode 100644
index 00000000000..c49131e0162
--- /dev/null
+++ b/src/mongo/db/geo/core.h
@@ -0,0 +1,550 @@
+// core.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+
+#include <cmath>
+
+#ifndef M_PI
+# define M_PI 3.14159265358979323846
+#endif
+
+namespace mongo {
+
+ class GeoBitSets {
+ public:
+ GeoBitSets() {
+ for ( int i=0; i<32; i++ ) {
+ masks32[i] = ( 1 << ( 31 - i ) );
+ }
+ for ( int i=0; i<64; i++ ) {
+ masks64[i] = ( 1LL << ( 63 - i ) );
+ }
+
+ for ( unsigned i=0; i<16; i++ ) {
+ unsigned fixed = 0;
+ for ( int j=0; j<4; j++ ) {
+ if ( i & ( 1 << j ) )
+ fixed |= ( 1 << ( j * 2 ) );
+ }
+ hashedToNormal[fixed] = i;
+ }
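+ // hashedToNormal maps a byte whose even bits hold 4 interleaved coordinate bits back
+ // to the plain 4-bit value, so unhash_fast() can de-interleave a nibble at a time.
+ // e.g. i = 6 (0b0110) spreads to fixed = 20 (0b00010100), and hashedToNormal[20] == 6.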
+
+ long long currAllX = 0, currAllY = 0;
+ for ( int i = 0; i < 64; i++ ){
+ if( i % 2 == 0 ){
+ allX[ i / 2 ] = currAllX;
+ currAllX = currAllX + ( 1LL << ( 63 - i ) );
+ }
+ else{
+ allY[ i / 2 ] = currAllY;
+ currAllY = currAllY + ( 1LL << ( 63 - i ) );
+ }
+ }
+ }
+ int masks32[32];
+ long long masks64[64];
+ long long allX[32];
+ long long allY[32];
+
+ unsigned hashedToNormal[256];
+ };
+
+ extern GeoBitSets geoBitSets;
+
+ class GeoHash {
+ public:
+
+ GeoHash()
+ : _hash(0),_bits(0) {
+ }
+
+ explicit GeoHash( const char * hash ) {
+ init( hash );
+ }
+
+ explicit GeoHash( const string& hash ) {
+ init( hash );
+ }
+
+ static GeoHash makeFromBinData(const char *bindata, unsigned bits) {
+ GeoHash h;
+ h._bits = bits;
+ h._copy( (char*)&h._hash , bindata );
+ h._fix();
+ return h;
+ }
+
+ explicit GeoHash( const BSONElement& e , unsigned bits=32 ) {
+ _bits = bits;
+ if ( e.type() == BinData ) {
+ int len = 0;
+ _copy( (char*)&_hash , e.binData( len ) );
+ assert( len == 8 );
+ _bits = bits;
+ }
+ else {
+ cout << "GeoHash bad element: " << e << endl;
+ uassert(13047,"wrong type for geo index. if you're using a pre-release version, need to rebuild index",0);
+ }
+ _fix();
+ }
+
+ GeoHash( unsigned x , unsigned y , unsigned bits=32) {
+ init( x , y , bits );
+ }
+
+ GeoHash( const GeoHash& old ) {
+ _hash = old._hash;
+ _bits = old._bits;
+ }
+
+ GeoHash( long long hash , unsigned bits )
+ : _hash( hash ) , _bits( bits ) {
+ _fix();
+ }
+
+ void init( unsigned x , unsigned y , unsigned bits ) {
+ assert( bits <= 32 );
+ _hash = 0;
+ _bits = bits;
+ for ( unsigned i=0; i<bits; i++ ) {
+ if ( isBitSet( x , i ) ) _hash |= geoBitSets.masks64[i*2];
+ if ( isBitSet( y , i ) ) _hash |= geoBitSets.masks64[(i*2)+1];
+ }
+ }
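+ // e.g. init( 1 , 0 , 32 ) sets only the lowest x bit: x bits occupy even hash positions
+ // and y bits odd positions (position 0 is most significant), so toString() ends "...10".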
+
+ void unhash_fast( unsigned& x , unsigned& y ) const {
+ x = 0;
+ y = 0;
+ char * c = (char*)(&_hash);
+ for ( int i=0; i<8; i++ ) {
+ unsigned t = (unsigned)(c[i]) & 0x55;
+ y |= ( geoBitSets.hashedToNormal[t] << (4*(i)) );
+
+ t = ( (unsigned)(c[i]) >> 1 ) & 0x55;
+ x |= ( geoBitSets.hashedToNormal[t] << (4*(i)) );
+ }
+ }
+
+ void unhash_slow( unsigned& x , unsigned& y ) const {
+ x = 0;
+ y = 0;
+ for ( unsigned i=0; i<_bits; i++ ) {
+ if ( getBitX(i) )
+ x |= geoBitSets.masks32[i];
+ if ( getBitY(i) )
+ y |= geoBitSets.masks32[i];
+ }
+ }
+
+ void unhash( unsigned& x , unsigned& y ) const {
+ unhash_fast( x , y );
+ }
+
+ /**
+ * @param bit the bit position within val; 0 is the high (most significant) bit
+ */
+ static bool isBitSet( unsigned val , unsigned bit ) {
+ return geoBitSets.masks32[bit] & val;
+ }
+
+ GeoHash up() const {
+ return GeoHash( _hash , _bits - 1 );
+ }
+
+ bool hasPrefix( const GeoHash& other ) const {
+ assert( other._bits <= _bits );
+ if ( other._bits == 0 )
+ return true;
+ long long x = other._hash ^ _hash;
+ x = x >> (64-(other._bits*2));
+ return x == 0;
+ }
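+ // e.g. GeoHash( "1100" ).hasPrefix( GeoHash( "11" ) ) is true, while
+ // GeoHash( "1000" ).hasPrefix( GeoHash( "11" ) ) is not (see GeoUnitTest below).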
+
+
+ string toString() const {
+ StringBuilder buf( _bits * 2 );
+ for ( unsigned x=0; x<_bits*2; x++ )
+ buf.append( _hash & geoBitSets.masks64[x] ? "1" : "0" );
+ return buf.str();
+ }
+
+ string toStringHex1() const {
+ stringstream ss;
+ ss << hex << _hash;
+ return ss.str();
+ }
+
+ void init( const string& s ) {
+ _hash = 0;
+ _bits = s.size() / 2;
+ for ( unsigned pos=0; pos<s.size(); pos++ )
+ if ( s[pos] == '1' )
+ setBit( pos , 1 );
+ }
+
+ void setBit( unsigned pos , bool one ) {
+ assert( pos < _bits * 2 );
+ if ( one )
+ _hash |= geoBitSets.masks64[pos];
+ else if ( _hash & geoBitSets.masks64[pos] )
+ _hash &= ~geoBitSets.masks64[pos];
+ }
+
+ bool getBit( unsigned pos ) const {
+ return _hash & geoBitSets.masks64[pos];
+ }
+
+ bool getBitX( unsigned pos ) const {
+ assert( pos < 32 );
+ return getBit( pos * 2 );
+ }
+
+ bool getBitY( unsigned pos ) const {
+ assert( pos < 32 );
+ return getBit( ( pos * 2 ) + 1 );
+ }
+
+ BSONObj wrap( const char* name = "" ) const {
+ BSONObjBuilder b(20);
+ append( b , name );
+ BSONObj o = b.obj();
+ if( ! strlen( name ) ) assert( o.objsize() == 20 );
+ return o;
+ }
+
+ bool constrains() const {
+ return _bits > 0;
+ }
+
+ bool canRefine() const {
+ return _bits < 32;
+ }
+
+ bool atMinX() const {
+ return ( _hash & geoBitSets.allX[ _bits ] ) == 0;
+ }
+
+ bool atMinY() const {
+ //log() << " MinY : " << hex << (unsigned long long) _hash << " " << _bits << " " << hex << (unsigned long long) geoBitSets.allY[ _bits ] << endl;
+ return ( _hash & geoBitSets.allY[ _bits ] ) == 0;
+ }
+
+ bool atMaxX() const {
+ return ( _hash & geoBitSets.allX[ _bits ] ) == geoBitSets.allX[ _bits ];
+ }
+
+ bool atMaxY() const {
+ return ( _hash & geoBitSets.allY[ _bits ] ) == geoBitSets.allY[ _bits ];
+ }
+
+ void move( int x , int y ) {
+ assert( _bits );
+ _move( 0 , x );
+ _move( 1 , y );
+ }
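+ // Moves wrap around the edge of the space, e.g. GeoHash( "000000" ).move( -1 , 0 )
+ // yields "101010" (see GeoUnitTest below).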
+
+ void _move( unsigned offset , int d ) {
+ if ( d == 0 )
+ return;
+ assert( d <= 1 && d>= -1 ); // TEMP
+
+ bool from, to;
+ if ( d > 0 ) {
+ from = 0;
+ to = 1;
+ }
+ else {
+ from = 1;
+ to = 0;
+ }
+
+ unsigned pos = ( _bits * 2 ) - 1;
+ if ( offset == 0 )
+ pos--;
+ while ( true ) {
+ if ( getBit(pos) == from ) {
+ setBit( pos , to );
+ return;
+ }
+
+ if ( pos < 2 ) {
+ // overflow
+ for ( ; pos < ( _bits * 2 ) ; pos += 2 ) {
+ setBit( pos , from );
+ }
+ return;
+ }
+
+ setBit( pos , from );
+ pos -= 2;
+ }
+
+ assert(0);
+ }
+
+ GeoHash& operator=(const GeoHash& h) {
+ _hash = h._hash;
+ _bits = h._bits;
+ return *this;
+ }
+
+ bool operator==(const GeoHash& h ) const {
+ return _hash == h._hash && _bits == h._bits;
+ }
+
+ bool operator!=(const GeoHash& h ) const {
+ return !( *this == h );
+ }
+
+ bool operator<(const GeoHash& h ) const {
+ if( _hash != h._hash ) return _hash < h._hash;
+ return _bits < h._bits;
+ }
+
+ GeoHash& operator+=( const char * s ) {
+ unsigned pos = _bits * 2;
+ _bits += strlen(s) / 2;
+ assert( _bits <= 32 );
+ while ( s[0] ) {
+ if ( s[0] == '1' )
+ setBit( pos , 1 );
+ pos++;
+ s++;
+ }
+
+ return *this;
+ }
+
+ GeoHash operator+( const char * s ) const {
+ GeoHash n = *this;
+ n+=s;
+ return n;
+ }
+
+ GeoHash operator+( string s ) const {
+ return operator+( s.c_str() );
+ }
+
+ void _fix() {
+ static long long FULL = 0xFFFFFFFFFFFFFFFFLL;
+ long long mask = FULL << ( 64 - ( _bits * 2 ) );
+ _hash &= mask;
+ }
+
+ void append( BSONObjBuilder& b , const char * name ) const {
+ char buf[8];
+ _copy( buf , (char*)&_hash );
+ b.appendBinData( name , 8 , bdtCustom , buf );
+ }
+
+ long long getHash() const {
+ return _hash;
+ }
+
+ unsigned getBits() const {
+ return _bits;
+ }
+
+ GeoHash commonPrefix( const GeoHash& other ) const {
+ unsigned i=0;
+ for ( ; i<_bits && i<other._bits; i++ ) {
+ if ( getBitX( i ) == other.getBitX( i ) &&
+ getBitY( i ) == other.getBitY( i ) )
+ continue;
+ break;
+ }
+ return GeoHash(_hash,i);
+ }
+
+ private:
+
+ static void _copy( char * dst , const char * src ) {
+ for ( unsigned a=0; a<8; a++ ) {
+ dst[a] = src[7-a];
+ }
+ }
+
+ long long _hash;
+ unsigned _bits; // bits per field; 0 (unconstrained) to 32
+ };
+
+ inline ostream& operator<<( ostream &s, const GeoHash &h ) {
+ s << h.toString();
+ return s;
+ }
+
+ class GeoConvert {
+ public:
+ virtual ~GeoConvert() {}
+
+ virtual void unhash( const GeoHash& h , double& x , double& y ) const = 0;
+ virtual GeoHash hash( double x , double y ) const = 0;
+ };
+
+ class Point {
+ public:
+
+ Point( const GeoConvert * g , const GeoHash& hash ) {
+ g->unhash( hash , _x , _y );
+ }
+
+ explicit Point( const BSONElement& e ) {
+ BSONObjIterator i(e.Obj());
+ _x = i.next().number();
+ _y = i.next().number();
+ }
+
+ explicit Point( const BSONObj& o ) {
+ BSONObjIterator i(o);
+ _x = i.next().number();
+ _y = i.next().number();
+ }
+
+ Point( double x , double y )
+ : _x( x ) , _y( y ) {
+ }
+
+ Point() : _x(0),_y(0) {
+ }
+
+ GeoHash hash( const GeoConvert * g ) {
+ return g->hash( _x , _y );
+ }
+
+ double distance( const Point& p ) const {
+ double a = _x - p._x;
+ double b = _y - p._y;
+
+ // Avoid numerical error if possible...
+ if( a == 0 ) return abs( _y - p._y );
+ if( b == 0 ) return abs( _x - p._x );
+
+ return sqrt( ( a * a ) + ( b * b ) );
+ }
+
+ /**
+ * Distance method that compares x or y coordinates when the other direction is zero;
+ * this avoids numerical error when distances are very close to the radius but axis-aligned.
+ *
+ * An example of the problem is:
+ * (52.0 - 51.9999) - 0.0001 = 3.31965e-15 and 52.0 - 51.9999 > 0.0001 in double arithmetic,
+ * but:
+ * 51.9999 + 0.0001 <= 52.0
+ *
+ * This avoids some (but not all!) surprising results in $center queries where points are
+ * ( radius + center.x, center.y ) or vice-versa.
+ */
+ bool distanceWithin( const Point& p, double radius ) const {
+ double a = _x - p._x;
+ double b = _y - p._y;
+
+ if( a == 0 ) {
+ //
+ // Note: For some unknown reason, when a 32-bit g++ optimizes this call, the sum is
+ // calculated imprecisely. We need to force the compiler to always evaluate it correctly,
+ // hence the weirdness.
+ //
+ // On some 32-bit linux machines, removing the volatile keyword or calculating the sum inline
+ // will make certain geo tests fail. Of course this check will force volatile for all 32-bit systems,
+ // not just affected systems.
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _y > p._y ? p._y + radius : _y + radius;
+ return _y > p._y ? sum >= _y : sum >= p._y;
+ }
+ else {
+ // Original math, correct for most systems
+ return _y > p._y ? p._y + radius >= _y : _y + radius >= p._y;
+ }
+ }
+ if( b == 0 ) {
+ if( sizeof(void*) <= 4 ){
+ volatile double sum = _x > p._x ? p._x + radius : _x + radius;
+ return _x > p._x ? sum >= _x : sum >= p._x;
+ }
+ else {
+ return _x > p._x ? p._x + radius >= _x : _x + radius >= p._x;
+ }
+ }
+
+ return sqrt( ( a * a ) + ( b * b ) ) <= radius;
+ }
+
+ string toString() const {
+ StringBuilder buf(32);
+ buf << "(" << _x << "," << _y << ")";
+ return buf.str();
+
+ }
+
+ double _x;
+ double _y;
+ };
+
+
+ extern const double EARTH_RADIUS_KM;
+ extern const double EARTH_RADIUS_MILES;
+
+ // Technically lat/long bounds, not really tied to earth radius.
+ inline void checkEarthBounds( Point p ) {
+ uassert( 14808, str::stream() << "point " << p.toString() << " must be in earth-like bounds of long : [-180, 180), lat : [-90, 90] ",
+ p._x >= -180 && p._x < 180 && p._y >= -90 && p._y <= 90 );
+ }
+
+ inline double deg2rad(double deg) { return deg * (M_PI/180); }
+ inline double rad2deg(double rad) { return rad * (180/M_PI); }
+
+ // WARNING: _x and _y MUST be longitude and latitude in that order
+ // note: multiply by earth radius for distance
+ inline double spheredist_rad( const Point& p1, const Point& p2 ) {
+ // this uses the n-vector formula: http://en.wikipedia.org/wiki/N-vector
+ // If you try to match the code to the formula, note that I inline the cross-product.
+ // TODO: optimize with SSE
+
+ double sin_x1(sin(p1._x)), cos_x1(cos(p1._x));
+ double sin_y1(sin(p1._y)), cos_y1(cos(p1._y));
+ double sin_x2(sin(p2._x)), cos_x2(cos(p2._x));
+ double sin_y2(sin(p2._y)), cos_y2(cos(p2._y));
+
+ double cross_prod =
+ (cos_y1*cos_x1 * cos_y2*cos_x2) +
+ (cos_y1*sin_x1 * cos_y2*sin_x2) +
+ (sin_y1 * sin_y2);
+
+ if (cross_prod >= 1 || cross_prod <= -1) {
+ // fun with floats
+ assert( fabs(cross_prod)-1 < 1e-6 );
+ return cross_prod > 0 ? 0 : M_PI;
+ }
+
+ return acos(cross_prod);
+ }
+
+ // note: return is still in radians as that can be multiplied by radius to get arc length
+ inline double spheredist_deg( const Point& p1, const Point& p2 ) {
+ return spheredist_rad(
+ Point( deg2rad(p1._x), deg2rad(p1._y) ),
+ Point( deg2rad(p2._x), deg2rad(p2._y) )
+ );
+ }
+
+}
diff --git a/src/mongo/db/geo/haystack.cpp b/src/mongo/db/geo/haystack.cpp
new file mode 100644
index 00000000000..104665087f6
--- /dev/null
+++ b/src/mongo/db/geo/haystack.cpp
@@ -0,0 +1,318 @@
+// db/geo/haystack.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "../namespace-inl.h"
+#include "../jsobj.h"
+#include "../index.h"
+#include "../../util/unittest.h"
+#include "../commands.h"
+#include "../pdfile.h"
+#include "../btree.h"
+#include "../curop-inl.h"
+#include "../matcher.h"
+#include "core.h"
+#include "../../util/timer.h"
+
+#define GEOQUADDEBUG(x)
+//#define GEOQUADDEBUG(x) cout << x << endl
+
+/**
+ * This is a geo-based search piece, which is different from the regular geo lookup.
+ * It is useful when you want to look for something within a region where the match ratio is low;
+ * it works well for searching for restaurants with a certain name within 25 miles,
+ * but should not be used for finding the closest restaurants that are open.
+ */
+namespace mongo {
+
+ string GEOSEARCHNAME = "geoHaystack";
+
+ class GeoHaystackSearchHopper {
+ public:
+ GeoHaystackSearchHopper( const BSONObj& n , double maxDistance , unsigned limit , const string& geoField )
+ : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField) {
+
+ }
+
+ void got( const DiskLoc& loc ) {
+ Point p( loc.obj().getFieldDotted( _geoField ) );
+ if ( _near.distance( p ) > _maxDistance )
+ return;
+ _locs.push_back( loc );
+ }
+
+ int append( BSONArrayBuilder& b ) {
+ for ( unsigned i=0; i<_locs.size() && i<_limit; i++ )
+ b.append( _locs[i].obj() );
+ return _locs.size();
+ }
+
+ Point _near;
+ double _maxDistance;
+ unsigned _limit;
+ string _geoField;
+
+ vector<DiskLoc> _locs;
+ };
+
+ class GeoHaystackSearchIndex : public IndexType {
+
+ public:
+
+ GeoHaystackSearchIndex( const IndexPlugin* plugin , const IndexSpec* spec )
+ : IndexType( plugin , spec ) {
+
+ BSONElement e = spec->info["bucketSize"];
+ uassert( 13321 , "need bucketSize" , e.isNumber() );
+ _bucketSize = e.numberDouble();
+
+ BSONObjBuilder orderBuilder;
+
+ BSONObjIterator i( spec->keyPattern );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ) {
+ uassert( 13314 , "can't have 2 geo fields" , _geo.size() == 0 );
+ uassert( 13315 , "2d has to be first in index" , _other.size() == 0 );
+ _geo = e.fieldName();
+ }
+ else {
+ _other.push_back( e.fieldName() );
+ }
+ orderBuilder.append( "" , 1 );
+ }
+
+ uassert( 13316 , "no geo field specified" , _geo.size() );
+ uassert( 13317 , "no other fields specified" , _other.size() );
+ uassert( 13326 , "quadrant search can only have 1 other field for now" , _other.size() == 1 );
+ _order = orderBuilder.obj();
+ }
+
+ int hash( const BSONElement& e ) const {
+ uassert( 13322 , "not a number" , e.isNumber() );
+ return hash( e.numberDouble() );
+ }
+
+ int hash( double d ) const {
+ d += 180;
+ d /= _bucketSize;
+ return (int)d;
+ }
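+ // e.g. (illustrative) with _bucketSize = 1, a longitude of -73.77 lands in bucket
+ // (int)( ( -73.77 + 180 ) / 1 ) = 106.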
+
+ string makeString( int hashedX , int hashedY ) const {
+ stringstream ss;
+ ss << hashedX << "_" << hashedY;
+ return ss.str();
+ }
+
+ void _add( const BSONObj& obj, const string& root , const BSONElement& e , BSONObjSet& keys ) const {
+ BSONObjBuilder buf;
+ buf.append( "" , root );
+ if ( e.eoo() )
+ buf.appendNull( "" );
+ else
+ buf.appendAs( e , "" );
+
+ BSONObj key = buf.obj();
+ GEOQUADDEBUG( obj << "\n\t" << root << "\n\t" << key );
+ keys.insert( key );
+ }
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+
+ BSONElement loc = obj.getFieldDotted( _geo );
+ if ( loc.eoo() )
+ return;
+
+ uassert( 13323 , "latlng not an array" , loc.isABSONObj() );
+ string root;
+ {
+ BSONObjIterator i( loc.Obj() );
+ BSONElement x = i.next();
+ BSONElement y = i.next();
+ root = makeString( hash(x) , hash(y) );
+ }
+
+
+ assert( _other.size() == 1 );
+
+ BSONElementSet all;
+ obj.getFieldsDotted( _other[0] , all );
+
+ if ( all.size() == 0 ) {
+ _add( obj , root , BSONElement() , keys );
+ }
+ else {
+ for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ) {
+ _add( obj , root , *i , keys );
+ }
+ }
+
+ }
+
+ shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+ shared_ptr<Cursor> c;
+ assert(0);
+ return c;
+ }
+
+ void searchCommand( NamespaceDetails* nsd , int idxNo ,
+ const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search ,
+ BSONObjBuilder& result , unsigned limit ) {
+
+ Timer t;
+
+ log(1) << "SEARCH near:" << n << " maxDistance:" << maxDistance << " search: " << search << endl;
+ int x,y;
+ {
+ BSONObjIterator i( n );
+ x = hash( i.next() );
+ y = hash( i.next() );
+ }
+ int scale = (int)ceil( maxDistance / _bucketSize );
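+ // e.g. (illustrative) with maxDistance = 25 and _bucketSize = 10, scale = 3 and the
+ // loops below scan a 7x7 grid of buckets around (x,y).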
+
+ GeoHaystackSearchHopper hopper(n,maxDistance,limit,_geo);
+
+ long long btreeMatches = 0;
+
+ for ( int a=-scale; a<=scale; a++ ) {
+ for ( int b=-scale; b<=scale; b++ ) {
+
+ BSONObjBuilder bb;
+ bb.append( "" , makeString( x + a , y + b ) );
+ for ( unsigned i=0; i<_other.size(); i++ ) {
+ BSONElement e = search.getFieldDotted( _other[i] );
+ if ( e.eoo() )
+ bb.appendNull( "" );
+ else
+ bb.appendAs( e , "" );
+ }
+
+ BSONObj key = bb.obj();
+
+ GEOQUADDEBUG( "KEY: " << key );
+
+ set<DiskLoc> thisPass;
+ scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsd , idxNo , *getDetails() , key , key , true , 1 ) );
+ while ( cursor->ok() ) {
+ pair<set<DiskLoc>::iterator, bool> p = thisPass.insert( cursor->currLoc() );
+ if ( p.second ) {
+ hopper.got( cursor->currLoc() );
+ GEOQUADDEBUG( "\t" << cursor->current() );
+ btreeMatches++;
+ }
+ cursor->advance();
+ }
+ }
+
+ }
+
+ BSONArrayBuilder arr( result.subarrayStart( "results" ) );
+ int num = hopper.append( arr );
+ arr.done();
+
+ {
+ BSONObjBuilder b( result.subobjStart( "stats" ) );
+ b.append( "time" , t.millis() );
+ b.appendNumber( "btreeMatches" , btreeMatches );
+ b.append( "n" , num );
+ b.done();
+ }
+ }
+
+ const IndexDetails* getDetails() const {
+ return _spec->getDetails();
+ }
+
+ string _geo;
+ vector<string> _other;
+
+ BSONObj _order;
+
+ double _bucketSize;
+ };
+
+ class GeoHaystackSearchIndexPlugin : public IndexPlugin {
+ public:
+ GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ) {
+ }
+
+ virtual IndexType* generate( const IndexSpec* spec ) const {
+ return new GeoHaystackSearchIndex( this , spec );
+ }
+
+ } nameIndexPlugin;
+
+
+ class GeoHaystackSearchCommand : public Command {
+ public:
+ GeoHaystackSearchCommand() : Command( "geoSearch" ) {}
+ virtual LockType locktype() const { return READ; }
+ bool slaveOk() const { return true; }
+ bool slaveOverrideOk() const { return true; }
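+ // e.g. (illustrative): db.runCommand( { geoSearch : "places" , near : [ 33 , 33 ] ,
+ // maxDistance : 6 , search : { type : "restaurant" } , limit : 30 } )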
+ bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+ NamespaceDetails * d = nsdetails( ns.c_str() );
+ if ( ! d ) {
+ errmsg = "can't find ns";
+ return false;
+ }
+
+ vector<int> idxs;
+ d->findIndexByType( GEOSEARCHNAME , idxs );
+ if ( idxs.size() == 0 ) {
+ errmsg = "no geoSearch index";
+ return false;
+ }
+ if ( idxs.size() > 1 ) {
+ errmsg = "more than 1 geosearch index";
+ return false;
+ }
+
+ int idxNum = idxs[0];
+
+ IndexDetails& id = d->idx( idxNum );
+ GeoHaystackSearchIndex * si = (GeoHaystackSearchIndex*)id.getSpec().getType();
+ assert( &id == si->getDetails() );
+
+ BSONElement n = cmdObj["near"];
+ BSONElement maxDistance = cmdObj["maxDistance"];
+ BSONElement search = cmdObj["search"];
+
+ uassert( 13318 , "near needs to be an array" , n.isABSONObj() );
+ uassert( 13319 , "maxDistance needs a number" , maxDistance.isNumber() );
+ uassert( 13320 , "search needs to be an object" , search.type() == Object );
+
+ unsigned limit = 50;
+ if ( cmdObj["limit"].isNumber() )
+ limit = (unsigned)cmdObj["limit"].numberInt();
+
+ si->searchCommand( d , idxNum , n.Obj() , maxDistance.numberDouble() , search.Obj() , result , limit );
+
+ return 1;
+ }
+
+ } nameSearchCommand;
+
+
+
+
+
+}
diff --git a/src/mongo/db/globals.h b/src/mongo/db/globals.h
new file mode 100644
index 00000000000..093bec76a0e
--- /dev/null
+++ b/src/mongo/db/globals.h
@@ -0,0 +1,54 @@
+// @file globals.h
+// grouping of global variables to make concurrency work clearer
+
+#pragma once
+
+namespace mongo {
+
+ void assertStartingUp();
+
+ // this is prototype for now, we'll see if it is helpful
+
+ /** "value is Const After Server Init" helper
+ *
+ * Example:
+ *
+ * casi<int> foo = 3;
+ * foo.ref() = 4; // asserts if not still in server init
+ * int x = foo+1; // ok anytime
+ *
+ */
+ template< class T >
+ class casi : boost::noncopyable {
+ T val;
+ public:
+ casi(const T& t) : val(t) {
+ DEV assertStartingUp();
+ }
+ operator const T& () { return val; }
+ T& ref() {
+ DEV assertStartingUp();
+ return val;
+ }
+ };
+
+ /** partially specialized for cases where our global variable is a pointer -- we want the value
+ * pointed at to be constant, not just the pointer itself
+ */
+ template< typename T >
+ class casi<T*> : boost::noncopyable {
+ T * val;
+ void operator=(T*);
+ public:
+ casi(T* t) : val(t) {
+ DEV assertStartingUp();
+ }
+ operator const T* () { return val; }
+ const T* get() { return val; }
+ T*& ref() {
+ DEV assertStartingUp();
+ return val;
+ }
+ };
+
+}
diff --git a/src/mongo/db/helpers/dblogger.h b/src/mongo/db/helpers/dblogger.h
new file mode 100644
index 00000000000..4d6ee6d78c4
--- /dev/null
+++ b/src/mongo/db/helpers/dblogger.h
@@ -0,0 +1,31 @@
+// @file dblogger.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ /** helper to log (and read log) of a capped collection in the database */
+ class DBLogger {
+ bool _inited;
+ public:
+ const string _ns;
+ DBLogger(string ns) : _inited(false), _ns(ns) { }
+ };
+
+}
diff --git a/src/mongo/db/index.cpp b/src/mongo/db/index.cpp
new file mode 100644
index 00000000000..5eaeab551df
--- /dev/null
+++ b/src/mongo/db/index.cpp
@@ -0,0 +1,446 @@
+/** @file index.cpp */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "namespace-inl.h"
+#include "index.h"
+#include "btree.h"
+#include "background.h"
+#include "repl/rs.h"
+#include "ops/delete.h"
+
+
+namespace mongo {
+
+ template< class V >
+ class IndexInterfaceImpl : public IndexInterface {
+ public:
+ typedef typename V::KeyOwned KeyOwned;
+ typedef Continuation<V> Cont;
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering);
+
+ Cont *c[NamespaceDetails::NIndexesMax];
+ int n;
+
+ public:
+ IndexInterfaceImpl() { n = 0; }
+
+        /* lacking write concurrency, this supports only a single writer at a time */
+ void _phasedBegin() {
+ // we do this here as phasedFinish can throw exceptions (we could catch there, but just as easy to do here)
+ for( int i = 0; i < n; i++ ) {
+ delete c[i];
+ c[i] = 0; // defensive
+ }
+ n = 0;
+ }
+ void phasedQueueItemToInsert(
+ int idxNo,
+ DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ const Ordering& _order, IndexDetails& _idx, bool dupsAllowed)
+ {
+ if( idxNo >= n )
+ n = idxNo + 1;
+ Cont *C = c[idxNo] = new Cont(thisLoc, _recordLoc, _key, _order, _idx);
+ thisLoc.btree<V>()->twoStepInsert(thisLoc, *C, dupsAllowed);
+ }
+ void _phasedFinish() {
+ for( int i = 0; i < n; i++ ) {
+                // when mixing v0 and v1 indexes (and only then) there can be nulls in the list
+ if( c[i] ) {
+ c[i]->stepTwo();
+ }
+ }
+ }
+
+/* virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction) {
+ return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction);
+ }
+ */
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) {
+ return thisLoc.btree<V>()->fullValidate(thisLoc, order);
+ }
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const {
+ return thisLoc.btree<V>()->findSingle(indexdetails,thisLoc,key);
+ }
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const {
+ return thisLoc.btree<V>()->unindex(thisLoc, id, key, recordLoc);
+ }
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const {
+ return thisLoc.btree<V>()->bt_insert(thisLoc, recordLoc, key, order, dupsAllowed, idx, toplevel);
+ }
+ virtual DiskLoc addBucket(const IndexDetails& id) {
+ return BtreeBucket<V>::addBucket(id);
+ }
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head, DiskLoc self, const Ordering& ordering) {
+ const BtreeBucket<V> *h = head.btree<V>();
+ for( vector<BSONObj*>::iterator i = addedKeys.begin(); i != addedKeys.end(); i++ ) {
+ KeyOwned k(**i);
+ bool dup = h->wouldCreateDup(idx, head, k, ordering, self);
+ uassert( 11001 , h->dupKeyError( idx , k ) , !dup);
+ }
+ }
+
+ // for geo:
+ virtual bool isUsed(DiskLoc thisLoc, int pos) { return thisLoc.btree<V>()->isUsed(pos); }
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj& key, DiskLoc& recordLoc) {
+ recordLoc = DiskLoc();
+ const BtreeBucket<V>* bucket = thisLoc.btree<V>();
+ int n = bucket->nKeys();
+
+ if( pos < 0 || pos >= n || n == 0xffff /* bucket deleted */ || ! bucket->isUsed( pos ) ){
+ // log() << "Pos: " << pos << " n " << n << endl;
+ return;
+ }
+
+ typename BtreeBucket<V>::KeyNode kn = bucket->keyNode(pos);
+ key = kn.key.toBson();
+ recordLoc = kn.recordLoc;
+ }
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) {
+ return thisLoc.btree<V>()->keyAt(pos).toBson();
+ }
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) {
+ return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction);
+ }
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) {
+ return thisLoc.btree<V>()->advance(thisLoc,keyOfs,direction,caller);
+ }
+ };
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o); // key.cpp
+
+ template <>
+ int IndexInterfaceImpl< V0 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return oldCompare(l, r, ordering);
+ }
+
+ template <>
+ int IndexInterfaceImpl< V1 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) {
+ return l.woCompare(r, ordering, /*considerfieldname*/false);
+ }
+
+ IndexInterfaceImpl<V0> iii_v0;
+ IndexInterfaceImpl<V1> iii_v1;
+
+ IndexInterface *IndexDetails::iis[] = { &iii_v0, &iii_v1 };
+
+ void IndexInterface::phasedBegin() {
+ iii_v0._phasedBegin();
+ iii_v1._phasedBegin();
+ }
+ void IndexInterface::phasedFinish() {
+ iii_v0._phasedFinish();
+ iii_v1._phasedFinish();
+ }
+
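The begin/queue/finish split lets the first step fault pages in (and possibly throw) before the second step mutates anything. A sketch of the intended call order for inserting one key per index; d, recordLoc, key, ordering and dupsAllowed are assumed to be in scope:

    IndexInterface::phasedBegin();                    // discard leftover continuations
    for ( int i = 0; i < d->nIndexes; i++ ) {         // d : NamespaceDetails*
        IndexDetails& idx = d->idx(i);
        idx.idxInterface().phasedQueueItemToInsert(
            i, idx.head, recordLoc, key, ordering, idx, dupsAllowed );
    }
    IndexInterface::phasedFinish();                   // stepTwo() on each queued insert
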
+ int removeFromSysIndexes(const char *ns, const char *idxName) {
+ string system_indexes = cc().database()->name + ".system.indexes";
+ BSONObjBuilder b;
+ b.append("ns", ns);
+ b.append("name", idxName); // e.g.: { name: "ts_1", ns: "foo.coll" }
+ BSONObj cond = b.done();
+ return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
+ }
+
+    /* this is just an attempt to clean up old orphaned stuff on a delete-all-indexes
+       call. repairDatabase is the clean solution, but this gives one a lighter-weight
+       partial option. see dropIndexes()
+ */
+ void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) {
+ string system_indexes = cc().database()->name + ".system.indexes";
+ BSONObjBuilder b;
+ b.append("ns", ns);
+ if( idIndex ) {
+ b.append("name", BSON( "$ne" << idIndex->indexName().c_str() ));
+ }
+ BSONObj cond = b.done();
+ int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true);
+ if( n ) {
+ log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl;
+ }
+ }
+
+ int IndexDetails::keyPatternOffset( const string& key ) const {
+ BSONObjIterator i( keyPattern() );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( key == e.fieldName() )
+ return n;
+ n++;
+ }
+ return -1;
+ }
+
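For example, given an index whose key pattern is { lastname: 1, firstname: 1 }:

    idx.keyPatternOffset( "lastname" );   // 0
    idx.keyPatternOffset( "firstname" );  // 1
    idx.keyPatternOffset( "age" );        // -1 (not in the pattern)
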
+ const IndexSpec& IndexDetails::getSpec() const {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this );
+ }
+
+ /* delete this index. does NOT clean up the system catalog
+ (system.indexes or system.namespaces) -- only NamespaceIndex.
+ */
+ void IndexDetails::kill_idx() {
+ string ns = indexNamespace(); // e.g. foo.coll.$ts_1
+ try {
+
+ string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below
+
+ // clean up parent namespace index cache
+ NamespaceDetailsTransient::get( pns.c_str() ).deletedIndex();
+
+ string name = indexName();
+
+ /* important to catch exception here so we can finish cleanup below. */
+ try {
+ dropNS(ns.c_str());
+ }
+ catch(DBException& ) {
+ log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl;
+ }
+ head.setInvalid();
+ info.setInvalid();
+
+ // clean up in system.indexes. we do this last on purpose.
+ int n = removeFromSysIndexes(pns.c_str(), name.c_str());
+ wassert( n == 1 );
+
+ }
+ catch ( DBException &e ) {
+ log() << "exception in kill_idx: " << e << ", ns: " << ns << endl;
+ }
+ }
+
+ void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const {
+ getSpec().getKeys( obj, keys );
+ }
+
+ void setDifference(BSONObjSet &l, BSONObjSet &r, vector<BSONObj*> &diff) {
+ // l and r must use the same ordering spec.
+ verify( 14819, l.key_comp().order() == r.key_comp().order() );
+ BSONObjSet::iterator i = l.begin();
+ BSONObjSet::iterator j = r.begin();
+ while ( 1 ) {
+ if ( i == l.end() )
+ break;
+ while ( j != r.end() && j->woCompare( *i ) < 0 )
+ j++;
+ if ( j == r.end() || i->woCompare(*j) != 0 ) {
+ const BSONObj *jo = &*i;
+ diff.push_back( (BSONObj *) jo );
+ }
+ i++;
+ }
+ }
+
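Both sets must be sorted under the same ordering for the lockstep walk to be valid (hence the verify above). The same merge-style difference over plain std::set, as a self-contained sketch of the algorithm:

    #include <set>
    #include <string>
    #include <vector>

    // computes diff = l \ r for two sorted sets, mirroring setDifference() above
    void setDifferenceSketch( const std::set<std::string>& l,
                              const std::set<std::string>& r,
                              std::vector<const std::string*>& diff ) {
        std::set<std::string>::const_iterator i = l.begin(), j = r.begin();
        for ( ; i != l.end(); ++i ) {
            while ( j != r.end() && *j < *i )
                ++j;                         // skip r elements smaller than *i
            if ( j == r.end() || *i < *j )   // *i is absent from r
                diff.push_back( &*i );
        }
    }
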
+ void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) {
+ int z = d.nIndexesBeingBuilt();
+ v.resize(z);
+ for( int i = 0; i < z; i++ ) {
+ IndexDetails& idx = d.idx(i);
+ BSONObj idxKey = idx.info.obj().getObjectField("key"); // eg { ts : 1 }
+ IndexChanges& ch = v[i];
+ idx.getKeysFromObject(oldObj, ch.oldkeys);
+ idx.getKeysFromObject(newObj, ch.newkeys);
+ if( ch.newkeys.size() > 1 )
+ d.setIndexIsMultikey(i);
+ setDifference(ch.oldkeys, ch.newkeys, ch.removed);
+ setDifference(ch.newkeys, ch.oldkeys, ch.added);
+ if ( ch.removed.size() > 0 && ch.added.size() > 0 && idx.isIdIndex() ) {
+ changedId = true;
+ }
+ }
+ }
+
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc) {
+ int z = d.nIndexesBeingBuilt();
+ for( int i = 0; i < z; i++ ) {
+ IndexDetails& idx = d.idx(i);
+ v[i].dupCheck(idx, curObjLoc);
+ }
+ }
+
+    // key pattern should be { <something> : <simpletype[1|-1]>, ... }
+ static bool validKeyPattern(BSONObj kp) {
+ BSONObjIterator i(kp);
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if( e.type() == Object || e.type() == Array )
+ return false;
+ }
+ return true;
+ }
+
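So { a: 1, b: -1 } passes, as do plugin patterns like { loc: "2d" } (string values are allowed), while object or array values are rejected. Illustrative calls:

    validKeyPattern( BSON( "a" << 1 << "b" << -1 ) );     // true
    validKeyPattern( BSON( "loc" << "2d" ) );             // true
    validKeyPattern( BSON( "a" << BSON( "b" << 1 ) ) );   // false: Object value
    validKeyPattern( fromjson( "{ a: [ 1, 2 ] }" ) );     // false: Array value
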
+ /* Prepare to build an index. Does not actually build it (except for a special _id case).
+       - validates that the params are good
+       - verifies that the index does not already exist
+       - creates the source collection if it does not yet exist
+
+ example of 'io':
+ { ns : 'test.foo', name : 'z', key : { z : 1 } }
+
+ throws DBException
+
+ @param sourceNS - source NS we are indexing
+ @param sourceCollection - its details ptr
+ @return true if ok to continue. when false we stop/fail silently (index already exists)
+ */
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) {
+ sourceCollection = 0;
+
+ // logical name of the index. todo: get rid of the name, we don't need it!
+ const char *name = io.getStringField("name");
+ uassert(12523, "no index name specified", *name);
+
+ // the collection for which we are building an index
+ sourceNS = io.getStringField("ns");
+ uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos);
+ uassert(10097, "bad table to index name on add index attempt",
+ cc().database()->name == nsToDatabase(sourceNS.c_str()));
+
+ BSONObj key = io.getObjectField("key");
+ uassert(12524, "index key pattern too large", key.objsize() <= 2048);
+ if( !validKeyPattern(key) ) {
+ string s = string("bad index key pattern ") + key.toString();
+ uasserted(10098 , s.c_str());
+ }
+
+ if ( sourceNS.empty() || key.isEmpty() ) {
+ log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" <<
+ sourceNS << "\n idxobj:" << io.toString() << endl;
+ string s = "bad add index attempt " + sourceNS + " key:" + key.toString();
+ uasserted(12504, s);
+ }
+
+ sourceCollection = nsdetails(sourceNS.c_str());
+ if( sourceCollection == 0 ) {
+ // try to create it
+ string err;
+ if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) {
+ problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl;
+ return false;
+ }
+ sourceCollection = nsdetails(sourceNS.c_str());
+ tlog() << "info: creating collection " << sourceNS << " on add index" << endl;
+ assert( sourceCollection );
+ }
+
+ if ( sourceCollection->findIndexByName(name) >= 0 ) {
+ // index already exists.
+ return false;
+ }
+ if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) {
+ log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl;
+ return false;
+ }
+
+ if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) {
+ stringstream ss;
+ ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString();
+ string s = ss.str();
+ log() << s << '\n';
+ uasserted(12505,s);
+ }
+
+ /* we can't build a new index for the ns if a build is already in progress in the background -
+ EVEN IF this is a foreground build.
+ */
+ uassert(12588, "cannot add index with a background operation in progress",
+ !BackgroundOperation::inProgForNs(sourceNS.c_str()));
+
+ /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to
+ all be treated as the same pattern.
+ */
+ if ( IndexDetails::isIdIndexPattern(key) ) {
+ if( !god ) {
+ ensureHaveIdIndex( sourceNS.c_str() );
+ return false;
+ }
+ }
+ else {
+ /* is buildIndexes:false set for this replica set member?
+ if so we don't build any indexes except _id
+ */
+ if( theReplSet && !theReplSet->buildIndexes() )
+ return false;
+ }
+
+ string pluginName = IndexPlugin::findPluginName( key );
+ IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0;
+
+
+ {
+ BSONObj o = io;
+ if ( plugin ) {
+ o = plugin->adjustIndexSpec(o);
+ }
+ BSONObjBuilder b;
+ int v = DefaultIndexVersionNumber;
+ if( !o["v"].eoo() ) {
+ double vv = o["v"].Number();
+                // note (one day) we may be able to fresh-build fewer versions than we can use
+ // isASupportedIndexVersionNumber() is what we can use
+ uassert(14803, str::stream() << "this version of mongod cannot build new indexes of version number " << vv,
+ vv == 0 || vv == 1);
+ v = (int) vv;
+ }
+ // idea is to put things we use a lot earlier
+ b.append("v", v);
+ b.append(o["key"]);
+ if( o["unique"].trueValue() )
+                b.appendBool("unique", true); // normalize to bool true in case it was int 1 or similar
+ b.append(o["ns"]);
+
+ {
+ // stripping _id
+ BSONObjIterator i(o);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string s = e.fieldName();
+ if( s != "_id" && s != "v" && s != "ns" && s != "unique" && s != "key" )
+ b.append(e);
+ }
+ }
+
+ fixedIndexObject = b.obj();
+ }
+
+ return true;
+ }
+
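The normalization block above rewrites the client-supplied spec so that the frequently read fields come first and _id is stripped. A before/after sketch with illustrative values:

    // io (as received from the client):
    //   { _id: ObjectId("..."), ns: "test.foo", name: "z_1", key: { z: 1 }, unique: 1 }
    // fixedIndexObject (as stored):
    //   { v: 1, key: { z: 1 }, unique: true, ns: "test.foo", name: "z_1" }
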
+ void IndexSpec::reset( const IndexDetails * details ) {
+ _details = details;
+ reset( details->info );
+ }
+
+ void IndexSpec::reset( const BSONObj& _info ) {
+ info = _info;
+ keyPattern = info["key"].embeddedObjectUserCheck();
+ if ( keyPattern.objsize() == 0 ) {
+ out() << info.toString() << endl;
+ assert(false);
+ }
+ _init();
+ }
+
+}
diff --git a/src/mongo/db/index.h b/src/mongo/db/index.h
new file mode 100644
index 00000000000..d297f8a4ca1
--- /dev/null
+++ b/src/mongo/db/index.h
@@ -0,0 +1,237 @@
+// index.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "diskloc.h"
+#include "jsobj.h"
+#include "indexkey.h"
+#include "key.h"
+
+namespace mongo {
+
+ class IndexInterface {
+ protected:
+ virtual ~IndexInterface() { }
+ public:
+ static void phasedBegin();
+ virtual void phasedQueueItemToInsert(
+ int idxNo,
+ DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
+ const Ordering& _order, IndexDetails& _idx, bool dupsAllowed) = 0;
+ static void phasedFinish();
+
+ virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering) = 0;
+ virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) = 0;
+ virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const = 0;
+ virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const = 0;
+ virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const = 0;
+ virtual DiskLoc addBucket(const IndexDetails&) = 0;
+ virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head,
+ DiskLoc self, const Ordering& ordering) = 0;
+
+ // these are for geo
+ virtual bool isUsed(DiskLoc thisLoc, int pos) = 0;
+ virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj&, DiskLoc& recordLoc) = 0;
+ virtual BSONObj keyAt(DiskLoc thisLoc, int pos) = 0;
+ virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) = 0;
+ virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
+ };
+
+ /* Details about a particular index. There is one of these effectively for each object in
+       system.indexes (although this also includes the head pointer, which is not in that
+ collection).
+
+ ** MemoryMapped Record ** (i.e., this is on disk data)
+ */
+ class IndexDetails {
+ public:
+ /**
+ * btree head disk location
+ * TODO We should make this variable private, since btree operations
+ * may change its value and we don't want clients to rely on an old
+ * value. If we create a btree class, we can provide a btree object
+ * to clients instead of 'head'.
+ */
+ DiskLoc head;
+
+ /* Location of index info object. Format:
+
+ { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
+ [, unique: <bool>, background: <bool>, v:<version>]
+ }
+
+ This object is in the system.indexes collection. Note that since we
+ have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
+ */
+ DiskLoc info;
+
+ /* extract key value from the query object
+ e.g., if key() == { x : 1 },
+ { x : 70, y : 3 } -> { x : 70 }
+ */
+ BSONObj getKeyFromQuery(const BSONObj& query) const {
+ BSONObj k = keyPattern();
+ BSONObj res = query.extractFieldsUnDotted(k);
+ return res;
+ }
+
+ /* pull out the relevant key objects from obj, so we
+ can index them. Note that the set is multiple elements
+ only when it's a "multikey" array.
+ keys will be left empty if key not found in the object.
+ */
+ void getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const;
+
+ /* get the key pattern for this object.
+ e.g., { lastname:1, firstname:1 }
+ */
+ BSONObj keyPattern() const {
+ return info.obj().getObjectField("key");
+ }
+
+ /**
+ * @return offset into keyPattern for key
+ -1 if doesn't exist
+ */
+ int keyPatternOffset( const string& key ) const;
+ bool inKeyPattern( const string& key ) const { return keyPatternOffset( key ) >= 0; }
+
+ /* true if the specified key is in the index */
+ bool hasKey(const BSONObj& key);
+
+ // returns name of this index's storage area
+ // database.table.$index
+ string indexNamespace() const {
+ BSONObj io = info.obj();
+ string s;
+ s.reserve(Namespace::MaxNsLen);
+ s = io.getStringField("ns");
+ assert( !s.empty() );
+ s += ".$";
+ s += io.getStringField("name");
+ return s;
+ }
+
+ string indexName() const { // e.g. "ts_1"
+ BSONObj io = info.obj();
+ return io.getStringField("name");
+ }
+
+ static bool isIdIndexPattern( const BSONObj &pattern ) {
+ BSONObjIterator i(pattern);
+ BSONElement e = i.next();
+ if( strcmp(e.fieldName(), "_id") != 0 ) return false;
+ return i.next().eoo();
+ }
+
+ /* returns true if this is the _id index. */
+ bool isIdIndex() const {
+ return isIdIndexPattern( keyPattern() );
+ }
+
+ /* gets not our namespace name (indexNamespace for that),
+ but the collection we index, its name.
+ */
+ string parentNS() const {
+ BSONObj io = info.obj();
+ return io.getStringField("ns");
+ }
+
+ static int versionForIndexObj( const BSONObj &obj ) {
+ BSONElement e = obj["v"];
+ if( e.type() == NumberInt )
+ return e._numberInt();
+ // should normally be an int. this is for backward compatibility
+ int v = e.numberInt();
+ uassert(14802, "index v field should be Integer type", v == 0);
+ return v;
+ }
+
+ int version() const {
+ return versionForIndexObj( info.obj() );
+ }
+
+ /** @return true if index has unique constraint */
+ bool unique() const {
+ BSONObj io = info.obj();
+ return io["unique"].trueValue() ||
+                /* temp: can we just make unique:true always be there for _id and get rid of this? */
+ isIdIndex();
+ }
+
+ /** return true if dropDups was set when building index (if any duplicates, dropdups drops the duplicating objects) */
+ bool dropDups() const {
+ return info.obj().getBoolField( "dropDups" );
+ }
+
+ /** delete this index. does NOT clean up the system catalog
+ (system.indexes or system.namespaces) -- only NamespaceIndex.
+ */
+ void kill_idx();
+
+ const IndexSpec& getSpec() const;
+
+ string toString() const {
+ return info.obj().toString();
+ }
+
+ /** @return true if supported. supported means we can use the index, including adding new keys.
+ it may not mean we can build the index version in question: we may not maintain building
+ of indexes in old formats in the future.
+ */
+ static bool isASupportedIndexVersionNumber(int v) { return (v&1)==v; } // v == 0 || v == 1
+
+        /** @return the interface for this index, which varies with the index version.
+ used for backward compatibility of index versions/formats.
+ */
+ IndexInterface& idxInterface() const {
+ int v = version();
+ dassert( isASupportedIndexVersionNumber(v) );
+ return *iis[v&1];
+ }
+
+ static IndexInterface *iis[];
+ };
+
+ struct IndexChanges { /*on an update*/
+ BSONObjSet oldkeys;
+ BSONObjSet newkeys;
+ vector<BSONObj*> removed; // these keys were removed as part of the change
+ vector<BSONObj*> added; // these keys were added as part of the change
+
+        /** @param curObjLoc - location of the object we want to add. if it is already in the
+            index, that is allowed here (for the background indexing case).
+ */
+ void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) {
+ if( added.empty() || !idx.unique() )
+ return;
+ const Ordering ordering = Ordering::make(idx.keyPattern());
+ idx.idxInterface().uassertIfDups(idx, added, idx.head, curObjLoc, ordering); // "E11001 duplicate key on update"
+ }
+ };
+
+ class NamespaceDetails;
+ // changedId should be initialized to false
+    void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId);
+ void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc);
+} // namespace mongo
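
Taken together, the accessors above derive everything from the cached info object. For an index named "ts_1" with key { ts: 1 } on collection foo.coll (illustrative values):

    idx.parentNS();        // "foo.coll"        -- the collection being indexed
    idx.indexName();       // "ts_1"
    idx.indexNamespace();  // "foo.coll.$ts_1"  -- storage namespace for the btree
    idx.keyPattern();      // { ts: 1 }
    idx.isIdIndex();       // false: pattern is not exactly one "_id" field
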
diff --git a/src/mongo/db/indexkey.cpp b/src/mongo/db/indexkey.cpp
new file mode 100644
index 00000000000..18dfcb079b9
--- /dev/null
+++ b/src/mongo/db/indexkey.cpp
@@ -0,0 +1,462 @@
+// index_key.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "namespace-inl.h"
+#include "index.h"
+#include "btree.h"
+#include "ops/query.h"
+#include "background.h"
+#include "../util/text.h"
+
+namespace mongo {
+
+ /** old (<= v1.8) : 0
+ 1 is new version
+ */
+ const int DefaultIndexVersionNumber = 1;
+
+ map<string,IndexPlugin*> * IndexPlugin::_plugins;
+
+ IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec )
+ : _plugin( plugin ) , _spec( spec ) {
+
+ }
+
+ IndexType::~IndexType() {
+ }
+
+ const BSONObj& IndexType::keyPattern() const {
+ return _spec->keyPattern;
+ }
+
+ IndexPlugin::IndexPlugin( const string& name )
+ : _name( name ) {
+ if ( ! _plugins )
+ _plugins = new map<string,IndexPlugin*>();
+ (*_plugins)[name] = this;
+ }
+
+ string IndexPlugin::findPluginName( const BSONObj& keyPattern ) {
+ string pluginName = "";
+
+ BSONObjIterator i( keyPattern );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() != String )
+ continue;
+
+ uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 || pluginName == e.String() );
+ pluginName = e.String();
+ }
+
+ return pluginName;
+ }
+
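Only string-valued fields in a key pattern name a plugin, and at most one distinct plugin may appear. For illustration:

    IndexPlugin::findPluginName( BSON( "loc" << "2d" ) );               // "2d"
    IndexPlugin::findPluginName( BSON( "a" << 1 << "b" << -1 ) );       // "" (plain btree)
    IndexPlugin::findPluginName( BSON( "a" << "2d" << "b" << "foo" ) ); // uasserts 13007
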
+ int IndexType::compare( const BSONObj& l , const BSONObj& r ) const {
+ return l.woCompare( r , _spec->keyPattern );
+ }
+
+ void IndexSpec::_init() {
+ assert( keyPattern.objsize() );
+
+ // some basics
+ _nFields = keyPattern.nFields();
+ _sparse = info["sparse"].trueValue();
+ uassert( 13529 , "sparse only works for single field keys" , ! _sparse || _nFields );
+
+
+ {
+ // build _nullKey
+
+ BSONObjBuilder b;
+ BSONObjIterator i( keyPattern );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ _fieldNames.push_back( e.fieldName() );
+ _fixed.push_back( BSONElement() );
+ b.appendNull( "" );
+ }
+ _nullKey = b.obj();
+ }
+
+ {
+ // _nullElt
+ BSONObjBuilder b;
+ b.appendNull( "" );
+ _nullObj = b.obj();
+ _nullElt = _nullObj.firstElement();
+ }
+
+ {
+ // _undefinedElt
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ _undefinedObj = b.obj();
+ _undefinedElt = _undefinedObj.firstElement();
+ }
+
+ {
+ // handle plugins
+ string pluginName = IndexPlugin::findPluginName( keyPattern );
+ if ( pluginName.size() ) {
+ IndexPlugin * plugin = IndexPlugin::get( pluginName );
+ if ( ! plugin ) {
+ log() << "warning: can't find plugin [" << pluginName << "]" << endl;
+ }
+ else {
+ _indexType.reset( plugin->generate( this ) );
+ }
+ }
+ }
+
+ _finishedInit = true;
+ }
+
+ void assertParallelArrays( const char *first, const char *second ) {
+ stringstream ss;
+ ss << "cannot index parallel arrays [" << first << "] [" << second << "]";
+ uasserted( ParallelArraysCode , ss.str() );
+ }
+
+ class KeyGeneratorV0 {
+ public:
+ KeyGeneratorV0( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
+ }
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const {
+ BSONElement arrElt;
+ unsigned arrIdx = ~0;
+ int numNotFound = 0;
+
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' )
+ continue;
+
+ BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] );
+
+ if ( e.eoo() ) {
+ e = _spec._nullElt; // no matching field
+ numNotFound++;
+ }
+
+ if ( e.type() != Array )
+ fieldNames[ i ] = ""; // no matching field or non-array match
+
+ if ( *fieldNames[ i ] == '\0' )
+ fixed[ i ] = e; // no need for further object expansion (though array expansion still possible)
+
+ if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here
+ arrIdx = i;
+ arrElt = e;
+ }
+
+ // enforce single array path here
+ if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) {
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
+ }
+
+ bool allFound = true; // have we found elements for all field names in the key spec?
+ for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) {
+ if ( **i != '\0' ) {
+ allFound = false;
+ break;
+ }
+ }
+
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ // we didn't find any fields
+ // so we're not going to index this document
+ return;
+ }
+
+ bool insertArrayNull = false;
+
+ if ( allFound ) {
+ if ( arrElt.eoo() ) {
+ // no terminal array element to expand
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i )
+ b.appendAs( *i, "" );
+ keys.insert( b.obj() );
+ }
+ else {
+ // terminal array element to expand, so generate all keys
+ BSONObjIterator i( arrElt.embeddedObject() );
+ if ( i.more() ) {
+ while( i.more() ) {
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx )
+ b.appendAs( i.next(), "" );
+ else
+ b.appendAs( fixed[ j ], "" );
+ }
+ keys.insert( b.obj() );
+ }
+ }
+ else if ( fixed.size() > 1 ) {
+ insertArrayNull = true;
+ }
+ }
+ }
+ else {
+ // nonterminal array element to expand, so recurse
+ assert( !arrElt.eoo() );
+ BSONObjIterator i( arrElt.embeddedObject() );
+ if ( i.more() ) {
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == Object ) {
+ _getKeys( fieldNames, fixed, e.embeddedObject(), keys );
+ }
+ }
+ }
+ else {
+ insertArrayNull = true;
+ }
+ }
+
+ if ( insertArrayNull ) {
+ // x : [] - need to insert undefined
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( unsigned j = 0; j < fixed.size(); ++j ) {
+ if ( j == arrIdx ) {
+ b.appendUndefined( "" );
+ }
+ else {
+ BSONElement e = fixed[j];
+ if ( e.eoo() )
+ b.appendNull( "" );
+ else
+ b.appendAs( e , "" );
+ }
+ }
+ keys.insert( b.obj() );
+ }
+ }
+
+ const IndexSpec &_spec;
+ };
+
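To make the v0 expansion rules concrete (one array path is expanded, missing fields become null, an empty array becomes undefined), a sketch for an index with key pattern { a: 1, b: 1 }:

    // { a: [ 1, 2 ], b: 3 }   -> keys { "": 1, "": 3 } and { "": 2, "": 3 }
    // { a: 7 }                -> key  { "": 7, "": null }         (b missing)
    // { a: [], b: 3 }         -> key  { "": undefined, "": 3 }
    // { a: [ 1 ], b: [ 2 ] }  -> uasserted: cannot index parallel arrays
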
+ class KeyGeneratorV1 {
+ public:
+ KeyGeneratorV1( const IndexSpec &spec ) : _spec( spec ) {}
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ if ( _spec._indexType.get() ) { //plugin (eg geo)
+ _spec._indexType->getKeys( obj , keys );
+ return;
+ }
+ vector<const char*> fieldNames( _spec._fieldNames );
+ vector<BSONElement> fixed( _spec._fixed );
+ _getKeys( fieldNames , fixed , obj, keys );
+ if ( keys.empty() && ! _spec._sparse )
+ keys.insert( _spec._nullKey );
+ }
+
+ private:
+ /**
+ * @param arrayNestedArray - set if the returned element is an array nested directly within arr.
+ */
+ BSONElement extractNextElement( const BSONObj &obj, const BSONObj &arr, const char *&field, bool &arrayNestedArray ) const {
+ string firstField = mongoutils::str::before( field, '.' );
+ bool haveObjField = !obj.getField( firstField ).eoo();
+ BSONElement arrField = arr.getField( firstField );
+ bool haveArrField = !arrField.eoo();
+
+ // An index component field name cannot exist in both a document array and one of that array's children.
+ uassert( 15855 , str::stream() << "Ambiguous field name found in array (do not use numeric field names in embedded elements in an array), field: '" << arrField.fieldName() << "' for array: " << arr, !haveObjField || !haveArrField );
+
+ arrayNestedArray = false;
+ if ( haveObjField ) {
+ return obj.getFieldDottedOrArray( field );
+ }
+ else if ( haveArrField ) {
+ if ( arrField.type() == Array ) {
+ arrayNestedArray = true;
+ }
+ return arr.getFieldDottedOrArray( field );
+ }
+ return BSONElement();
+ }
+
+ void _getKeysArrEltFixed( vector<const char*> &fieldNames , vector<BSONElement> &fixed , const BSONElement &arrEntry, BSONObjSet &keys, int numNotFound, const BSONElement &arrObjElt, const set< unsigned > &arrIdxs, bool mayExpandArrayUnembedded ) const {
+ // set up any terminal array values
+ for( set<unsigned>::const_iterator j = arrIdxs.begin(); j != arrIdxs.end(); ++j ) {
+ if ( *fieldNames[ *j ] == '\0' ) {
+ fixed[ *j ] = mayExpandArrayUnembedded ? arrEntry : arrObjElt;
+ }
+ }
+ // recurse
+ _getKeys( fieldNames, fixed, ( arrEntry.type() == Object ) ? arrEntry.embeddedObject() : BSONObj(), keys, numNotFound, arrObjElt.embeddedObject() );
+ }
+
+ /**
+ * @param fieldNames - fields to index, may be postfixes in recursive calls
+ * @param fixed - values that have already been identified for their index fields
+ * @param obj - object from which keys should be extracted, based on names in fieldNames
+ * @param keys - set where index keys are written
+ * @param numNotFound - number of index fields that have already been identified as missing
+ * @param array - array from which keys should be extracted, based on names in fieldNames
+ * If obj and array are both nonempty, obj will be one of the elements of array.
+ */
+ void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys, int numNotFound = 0, const BSONObj &array = BSONObj() ) const {
+ BSONElement arrElt;
+ set<unsigned> arrIdxs;
+ bool mayExpandArrayUnembedded = true;
+ for( unsigned i = 0; i < fieldNames.size(); ++i ) {
+ if ( *fieldNames[ i ] == '\0' ) {
+ continue;
+ }
+
+ bool arrayNestedArray;
+ // Extract element matching fieldName[ i ] from object xor array.
+ BSONElement e = extractNextElement( obj, array, fieldNames[ i ], arrayNestedArray );
+
+ if ( e.eoo() ) {
+ // if field not present, set to null
+ fixed[ i ] = _spec._nullElt;
+ // done expanding this field name
+ fieldNames[ i ] = "";
+ numNotFound++;
+ }
+ else if ( e.type() == Array ) {
+ arrIdxs.insert( i );
+ if ( arrElt.eoo() ) {
+ // we only expand arrays on a single path -- track the path here
+ arrElt = e;
+ }
+ else if ( e.rawdata() != arrElt.rawdata() ) {
+ // enforce single array path here
+ assertParallelArrays( e.fieldName(), arrElt.fieldName() );
+ }
+ if ( arrayNestedArray ) {
+ mayExpandArrayUnembedded = false;
+ }
+ }
+ else {
+ // not an array - no need for further expansion
+ fixed[ i ] = e;
+ }
+ }
+
+ if ( arrElt.eoo() ) {
+ // No array, so generate a single key.
+ if ( _spec._sparse && numNotFound == _spec._nFields ) {
+ return;
+ }
+ BSONObjBuilder b(_spec._sizeTracker);
+ for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) {
+ b.appendAs( *i, "" );
+ }
+ keys.insert( b.obj() );
+ }
+ else if ( arrElt.embeddedObject().firstElement().eoo() ) {
+ // Empty array, so set matching fields to undefined.
+ _getKeysArrEltFixed( fieldNames, fixed, _spec._undefinedElt, keys, numNotFound, arrElt, arrIdxs, true );
+ }
+ else {
+ // Non empty array that can be expanded, so generate a key for each member.
+ BSONObj arrObj = arrElt.embeddedObject();
+ BSONObjIterator i( arrObj );
+ while( i.more() ) {
+ _getKeysArrEltFixed( fieldNames, fixed, i.next(), keys, numNotFound, arrElt, arrIdxs, mayExpandArrayUnembedded );
+ }
+ }
+ }
+
+ const IndexSpec &_spec;
+ };
+
+ void IndexSpec::getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+ switch( indexVersion() ) {
+ case 0: {
+ KeyGeneratorV0 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ case 1: {
+ KeyGeneratorV1 g( *this );
+ g.getKeys( obj, keys );
+ break;
+ }
+ default:
+ massert( 15869, "Invalid index version for key generation.", false );
+ }
+ }
+
+ bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ) {
+ BSONObjIterator x(a);
+ while ( x.more() ) {
+ BSONElement e = x.next();
+ BSONObjIterator y(b);
+ while ( y.more() ) {
+ BSONElement f = y.next();
+ FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() );
+ if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const {
+ if ( _indexType.get() )
+ return _indexType->suitability( query , order );
+ return _suitability( query , order );
+ }
+
+ IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const {
+ // TODO: optimize
+ if ( anyElementNamesMatch( keyPattern , query ) == 0 && anyElementNamesMatch( keyPattern , order ) == 0 )
+ return USELESS;
+ return HELPFUL;
+ }
+
+ IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const {
+ return _spec->_suitability( query , order );
+ }
+
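The default heuristic is deliberately coarse: any overlap of dotted field names between the key pattern and the query or sort makes a plain (non-plugin) index HELPFUL; OPTIMAL is only ever claimed by plugin types. For a spec with key pattern { a: 1 }:

    spec.suitability( BSON( "a" << 5 ), BSONObj() );   // HELPFUL (query mentions a)
    spec.suitability( BSONObj(), BSON( "a" << 1 ) );   // HELPFUL (sort mentions a)
    spec.suitability( BSON( "b" << 5 ), BSONObj() );   // USELESS
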
+ int IndexSpec::indexVersion() const {
+ if ( !info.hasField( "v" ) ) {
+ return DefaultIndexVersionNumber;
+ }
+ return IndexDetails::versionForIndexObj( info );
+ }
+
+ bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const {
+ return ! order.isEmpty();
+ }
+
+}
diff --git a/src/mongo/db/indexkey.h b/src/mongo/db/indexkey.h
new file mode 100644
index 00000000000..12cd755e8a0
--- /dev/null
+++ b/src/mongo/db/indexkey.h
@@ -0,0 +1,198 @@
+// index_key.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "diskloc.h"
+#include "jsobj.h"
+#include <map>
+
+namespace mongo {
+
+ extern const int DefaultIndexVersionNumber;
+
+ const int ParallelArraysCode = 10088;
+
+ class Cursor;
+ class IndexSpec;
+ class IndexType; // TODO: this name sucks
+ class IndexPlugin;
+ class IndexDetails;
+
+ enum IndexSuitability { USELESS = 0 , HELPFUL = 1 , OPTIMAL = 2 };
+
+ /**
+     * this represents an instance of an index plugin,
+     * done this way so parsing, etc. can be cached per index:
+     * if there is an FTS IndexPlugin, each index using FTS
+     * gets 1 of these, and it can have things pre-parsed, etc.
+ */
+ class IndexType : boost::noncopyable {
+ public:
+ IndexType( const IndexPlugin * plugin , const IndexSpec * spec );
+ virtual ~IndexType();
+
+ virtual void getKeys( const BSONObj &obj, BSONObjSet &keys ) const = 0;
+ virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0;
+
+ /** optional op : changes query to match what's in the index */
+ virtual BSONObj fixKey( const BSONObj& in ) { return in; }
+
+ /** optional op : compare 2 objects with regards to this index */
+ virtual int compare( const BSONObj& l , const BSONObj& r ) const;
+
+ /** @return plugin */
+ const IndexPlugin * getPlugin() const { return _plugin; }
+
+ const BSONObj& keyPattern() const;
+
+ virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ virtual bool scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const ;
+
+ protected:
+ const IndexPlugin * _plugin;
+ const IndexSpec * _spec;
+ };
+
+ /**
+ * this represents a plugin
+ * a plugin could be something like full text search, sparse index, etc...
+ * 1 of these exists per type of index per server
+ * 1 IndexType is created per index using this plugin
+ */
+ class IndexPlugin : boost::noncopyable {
+ public:
+ IndexPlugin( const string& name );
+ virtual ~IndexPlugin() {}
+
+ virtual IndexType* generate( const IndexSpec * spec ) const = 0;
+
+ string getName() const { return _name; }
+
+ /**
+ * @return new keyPattern
+ * if nothing changes, should return keyPattern
+ */
+ virtual BSONObj adjustIndexSpec( const BSONObj& spec ) const { return spec; }
+
+ // ------- static below -------
+
+ static IndexPlugin* get( const string& name ) {
+ if ( ! _plugins )
+ return 0;
+ map<string,IndexPlugin*>::iterator i = _plugins->find( name );
+ if ( i == _plugins->end() )
+ return 0;
+ return i->second;
+ }
+
+ /**
+ * @param keyPattern { x : "fts" }
+ * @return "" or the name
+ */
+ static string findPluginName( const BSONObj& keyPattern );
+
+ private:
+ string _name;
+ static map<string,IndexPlugin*> * _plugins;
+ };
+
+ /* precomputed details about an index, used for inserting keys on updates
+ stored/cached in NamespaceDetailsTransient, or can be used standalone
+ */
+ class IndexSpec {
+ public:
+ BSONObj keyPattern; // e.g., { name : 1 }
+ BSONObj info; // this is the same as IndexDetails::info.obj()
+
+ IndexSpec()
+ : _details(0) , _finishedInit(false) {
+ }
+
+ explicit IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
+ : keyPattern(k) , info(m) , _details(0) , _finishedInit(false) {
+ _init();
+ }
+
+ /**
+           this is a DiskLoc of an IndexDetails info object;
+           it should have a key field
+ */
+ explicit IndexSpec( const DiskLoc& loc ) {
+ reset( loc );
+ }
+
+ void reset( const BSONObj& info );
+ void reset( const DiskLoc& infoLoc ) { reset(infoLoc.obj()); }
+ void reset( const IndexDetails * details );
+
+ void getKeys( const BSONObj &obj, BSONObjSet &keys ) const;
+
+ BSONElement missingField() const { return _nullElt; }
+
+ string getTypeName() const {
+ if ( _indexType.get() )
+ return _indexType->getPlugin()->getName();
+ return "";
+ }
+
+ IndexType* getType() const {
+ return _indexType.get();
+ }
+
+ const IndexDetails * getDetails() const {
+ return _details;
+ }
+
+ IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ protected:
+
+ int indexVersion() const;
+
+ IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+ BSONSizeTracker _sizeTracker;
+ vector<const char*> _fieldNames;
+ vector<BSONElement> _fixed;
+
+ BSONObj _nullKey; // a full key with all fields null
+ BSONObj _nullObj; // only used for _nullElt
+ BSONElement _nullElt; // jstNull
+
+ BSONObj _undefinedObj; // only used for _undefinedElt
+ BSONElement _undefinedElt; // undefined
+
+ int _nFields; // number of fields in the index
+ bool _sparse; // if the index is sparse
+ shared_ptr<IndexType> _indexType;
+ const IndexDetails * _details;
+
+ void _init();
+
+ friend class IndexType;
+ friend class KeyGeneratorV0;
+ friend class KeyGeneratorV1;
+ public:
+ bool _finishedInit;
+ };
+
+
+} // namespace mongo
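
A plugin registers itself simply by existing: the IndexPlugin constructor inserts this into the static name map, so a file-scope instance is registered during static initialization. A minimal hypothetical plugin sketched against the interfaces above -- the "demo" name and the empty key handling are invented for illustration:

    class DemoType : public IndexType {
    public:
        DemoType( const IndexPlugin* p, const IndexSpec* s ) : IndexType( p, s ) {}
        void getKeys( const BSONObj& obj, BSONObjSet& keys ) const {
            // a real plugin derives index keys from obj here
        }
        shared_ptr<Cursor> newCursor( const BSONObj& query, const BSONObj& order,
                                      int numWanted ) const {
            return shared_ptr<Cursor>();   // omitted in this sketch
        }
    };

    class DemoPlugin : public IndexPlugin {
    public:
        DemoPlugin() : IndexPlugin( "demo" ) {}   // self-registers under "demo"
        IndexType* generate( const IndexSpec* spec ) const {
            return new DemoType( this, spec );
        }
    } demoPlugin;   // static instance: selected by a key pattern like { x: "demo" }
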
diff --git a/src/mongo/db/instance.cpp b/src/mongo/db/instance.cpp
new file mode 100644
index 00000000000..c8f8c6ea85b
--- /dev/null
+++ b/src/mongo/db/instance.cpp
@@ -0,0 +1,1148 @@
+// instance.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "../bson/util/atomic_int.h"
+#include "introspect.h"
+#include "repl.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "json.h"
+#include "replutil.h"
+#include "../s/d_logic.h"
+#include "../util/file_allocator.h"
+#include "../util/goodies.h"
+#include "cmdline.h"
+#if !defined(_WIN32)
+#include <sys/file.h>
+#endif
+#include "stats/counters.h"
+#include "background.h"
+#include "dur_journal.h"
+#include "dur_recover.h"
+#include "d_concurrency.h"
+#include "ops/count.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+#include "ops/update.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+ // "diaglog"
+ inline void opread(Message& m) { if( _diaglog.getLevel() & 2 ) _diaglog.readop((char *) m.singleData(), m.header()->len); }
+ inline void opwrite(Message& m) { if( _diaglog.getLevel() & 1 ) _diaglog.write((char *) m.singleData(), m.header()->len); }
+
+ void receivedKillCursors(Message& m);
+ void receivedUpdate(Message& m, CurOp& op);
+ void receivedDelete(Message& m, CurOp& op);
+ void receivedInsert(Message& m, CurOp& op);
+ bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop );
+
+ int nloggedsome = 0;
+#define LOGWITHRATELIMIT if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 )
+
+ string dbExecCommand;
+
+ DiagLog _diaglog;
+
+ bool useCursors = true;
+ bool useHints = true;
+
+ KillCurrentOp killCurrentOp;
+
+ int lockFile = 0;
+#ifdef _WIN32
+ HANDLE lockFileHandle;
+#endif
+
+ // see FSyncCommand:
+ extern bool lockedForWriting;
+
+ OpTime OpTime::now() {
+ DEV d.dbMutex.assertWriteLocked();
+ return now_inlock();
+ }
+ OpTime OpTime::last_inlock(){
+ DEV d.dbMutex.assertAtLeastReadLocked();
+ return last;
+ }
+
+ // OpTime::now() uses dbMutex, thus it is in this file not in the cpp files used by drivers and such
+ void BSONElementManipulator::initTimestamp() {
+ massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
+ unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
+ if ( timestamp == 0 )
+ timestamp = OpTime::now().asDate();
+ }
+ void BSONElementManipulator::SetNumber(double d) {
+ if ( _element.type() == NumberDouble )
+ *getDur().writing( reinterpret_cast< double * >( value() ) ) = d;
+ else if ( _element.type() == NumberInt )
+ *getDur().writing( reinterpret_cast< int * >( value() ) ) = (int) d;
+ else assert(0);
+ }
+ void BSONElementManipulator::SetLong(long long n) {
+ assert( _element.type() == NumberLong );
+ *getDur().writing( reinterpret_cast< long long * >(value()) ) = n;
+ }
+ void BSONElementManipulator::SetInt(int n) {
+ assert( _element.type() == NumberInt );
+ getDur().writingInt( *reinterpret_cast< int * >( value() ) ) = n;
+ }
+ /* dur:: version */
+ void BSONElementManipulator::ReplaceTypeAndValue( const BSONElement &e ) {
+ char *d = data();
+ char *v = value();
+ int valsize = e.valuesize();
+ int ofs = (int) (v-d);
+ dassert( ofs > 0 );
+ char *p = (char *) getDur().writingPtr(d, valsize + ofs);
+ *p = e.type();
+ memcpy( p + ofs, e.value(), valsize );
+ }
+
+ void inProgCmd( Message &m, DbResponse &dbresponse ) {
+ BSONObjBuilder b;
+
+ if( ! cc().isAdmin() ) {
+ b.append("err", "unauthorized");
+ }
+ else {
+ DbMessage d(m);
+ QueryMessage q(d);
+ bool all = q.query["$all"].trueValue();
+ vector<BSONObj> vals;
+ {
+ Client& me = cc();
+ scoped_lock bl(Client::clientsMutex);
+ auto_ptr<Matcher> m(new Matcher(q.query));
+ for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) {
+ Client *c = *i;
+ assert( c );
+ CurOp* co = c->curop();
+ if ( c == &me && !co ) {
+ continue;
+ }
+ assert( co );
+ if( all || co->active() ) {
+ BSONObj info = co->infoNoauth();
+ if ( all || m->matches( info )) {
+ vals.push_back( info );
+ }
+ }
+ }
+ }
+ b.append("inprog", vals);
+ unsigned x = lockedForWriting;
+ if( x ) {
+ b.append("fsyncLock", x);
+ b.append("info", "use db.fsyncUnlock() to terminate the fsync write/snapshot lock");
+ }
+ }
+
+ replyToQuery(0, m, dbresponse, b.obj());
+ }
+
+ void killOp( Message &m, DbResponse &dbresponse ) {
+ BSONObj obj;
+ if( ! cc().isAdmin() ) {
+ obj = fromjson("{\"err\":\"unauthorized\"}");
+ }
+ /*else if( !dbMutexInfo.isLocked() )
+ obj = fromjson("{\"info\":\"no op in progress/not locked\"}");
+ */
+ else {
+ DbMessage d(m);
+ QueryMessage q(d);
+ BSONElement e = q.query.getField("op");
+ if( !e.isNumber() ) {
+ obj = fromjson("{\"err\":\"no op number field specified?\"}");
+ }
+ else {
+ log() << "going to kill op: " << e << endl;
+ obj = fromjson("{\"info\":\"attempting to kill op\"}");
+ killCurrentOp.kill( (unsigned) e.number() );
+ }
+ }
+ replyToQuery(0, m, dbresponse, obj);
+ }
+
+ void unlockFsyncAndWait();
+ void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) {
+ BSONObj obj;
+ if ( ! cc().isAdmin() ) { // checks auth
+ obj = fromjson("{\"err\":\"unauthorized\"}");
+ }
+ else if (strncmp(ns, "admin.", 6) != 0 ) {
+ obj = fromjson("{\"err\":\"unauthorized - this command must be run against the admin DB\"}");
+ }
+ else {
+ if( lockedForWriting ) {
+ log() << "command: unlock requested" << endl;
+ obj = fromjson("{ok:1,\"info\":\"unlock completed\"}");
+ unlockFsyncAndWait();
+ }
+ else {
+ obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}");
+ }
+ }
+ replyToQuery(0, m, dbresponse, obj);
+ }
+
+ static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
+ bool ok = true;
+ MSGID responseTo = m.header()->id;
+
+ DbMessage d(m);
+ QueryMessage q(d);
+ auto_ptr< Message > resp( new Message() );
+
+ CurOp& op = *(c.curop());
+
+ shared_ptr<AssertionException> ex;
+
+ try {
+ dbresponse.exhaust = runQuery(m, q, op, *resp);
+ assert( !resp->empty() );
+ }
+ catch ( SendStaleConfigException& e ){
+ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) );
+ ok = false;
+ }
+ catch ( AssertionException& e ) {
+ ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
+ ok = false;
+ }
+
+ if( ex ){
+
+ op.debug().exceptionInfo = ex->getInfo();
+ LOGWITHRATELIMIT {
+ log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
+ (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
+ if( q.ntoskip || q.ntoreturn )
+ log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
+ }
+
+ SendStaleConfigException* scex = NULL;
+ if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() );
+
+ BSONObjBuilder err;
+ ex->getInfo().append( err );
+ if( scex ) err.append( "ns", scex->getns() );
+ BSONObj errObj = err.done();
+
+ log() << errObj << endl;
+
+ BufBuilder b;
+ b.skip(sizeof(QueryResult));
+ b.appendBuf((void*) errObj.objdata(), errObj.objsize());
+
+ // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
+ QueryResult * msgdata = (QueryResult *) b.buf();
+ b.decouple();
+ QueryResult *qr = msgdata;
+ qr->_resultFlags() = ResultFlag_ErrSet;
+ if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ resp.reset( new Message() );
+ resp->setData( msgdata, true );
+
+ }
+
+ op.debug().responseLength = resp->header()->dataLen();
+
+ dbresponse.response = resp.release();
+ dbresponse.responseTo = responseTo;
+
+ return ok;
+ }
+
+ void (*reportEventToSystem)(const char *msg) = 0;
+
+ void mongoAbort(const char *msg) {
+ if( reportEventToSystem )
+ reportEventToSystem(msg);
+ rawOut(msg);
+ ::abort();
+ }
+
+    // handles a single client request; the old dbMsg "end" request is now just answered with a deprecation notice
+ void _assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) {
+
+ // before we lock...
+ int op = m.operation();
+ bool isCommand = false;
+ const char *ns = m.singleData()->_data + 4;
+ if ( op == dbQuery ) {
+ if( strstr(ns, ".$cmd") ) {
+ isCommand = true;
+ opwrite(m);
+ if( strstr(ns, ".$cmd.sys.") ) {
+ if( strstr(ns, "$cmd.sys.inprog") ) {
+ inProgCmd(m, dbresponse);
+ return;
+ }
+ if( strstr(ns, "$cmd.sys.killop") ) {
+ killOp(m, dbresponse);
+ return;
+ }
+ if( strstr(ns, "$cmd.sys.unlock") ) {
+ unlockFsync(ns, m, dbresponse);
+ return;
+ }
+ }
+ }
+ else {
+ opread(m);
+ }
+ }
+ else if( op == dbGetMore ) {
+ opread(m);
+ }
+ else {
+ opwrite(m);
+ }
+
+ globalOpCounters.gotOp( op , isCommand );
+
+ Client& c = cc();
+
+ auto_ptr<CurOp> nestedOp;
+ CurOp* currentOpP = c.curop();
+ if ( currentOpP->active() ) {
+ nestedOp.reset( new CurOp( &c , currentOpP ) );
+ currentOpP = nestedOp.get();
+ }
+ CurOp& currentOp = *currentOpP;
+ currentOp.reset(remote,op);
+
+ OpDebug& debug = currentOp.debug();
+ debug.op = op;
+
+ int logThreshold = cmdLine.slowMS;
+ bool log = logLevel >= 1;
+
+ if ( op == dbQuery ) {
+ if ( handlePossibleShardedMessage( m , &dbresponse ) )
+ return;
+ receivedQuery(c , dbresponse, m );
+ }
+ else if ( op == dbGetMore ) {
+ if ( ! receivedGetMore(dbresponse, m, currentOp) )
+ log = true;
+ }
+ else if ( op == dbMsg ) {
+ // deprecated - replaced by commands
+ char *p = m.singleData()->_data;
+ int len = strlen(p);
+ if ( len > 400 )
+ out() << curTimeMillis64() % 10000 <<
+ " long msg received, len:" << len << endl;
+
+ Message *resp = new Message();
+ if ( strcmp( "end" , p ) == 0 )
+ resp->setData( opReply , "dbMsg end no longer supported" );
+ else
+ resp->setData( opReply , "i am fine - dbMsg deprecated");
+
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+ }
+ else {
+ const char *ns = m.singleData()->_data + 4;
+ char cl[256];
+ nsToDatabase(ns, cl);
+ if( ! c.getAuthenticationInfo()->isAuthorized(cl) ) {
+ uassert_nothrow("unauthorized");
+ }
+ else {
+ try {
+ if ( op == dbInsert ) {
+ receivedInsert(m, currentOp);
+ }
+ else if ( op == dbUpdate ) {
+ receivedUpdate(m, currentOp);
+ }
+ else if ( op == dbDelete ) {
+ receivedDelete(m, currentOp);
+ }
+ else if ( op == dbKillCursors ) {
+ currentOp.ensureStarted();
+ logThreshold = 10;
+ receivedKillCursors(m);
+ }
+ else {
+ mongo::log() << " operation isn't supported: " << op << endl;
+ currentOp.done();
+ log = true;
+ }
+ }
+ catch ( UserException& ue ) {
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl;
+ debug.exceptionInfo = ue.getInfo();
+ }
+ catch ( AssertionException& e ) {
+ tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl;
+ debug.exceptionInfo = e.getInfo();
+ log = true;
+ }
+ }
+ }
+ currentOp.ensureStarted();
+ currentOp.done();
+ debug.executionTime = currentOp.totalTimeMillis();
+
+ //DEV log = true;
+ if ( log || debug.executionTime > logThreshold ) {
+ if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && debug.executionTime < 4300 && !log ) {
+ /* it's normal for getMore on the oplog to be slow because of use of awaitdata flag. */
+ }
+ else {
+ mongo::tlog() << debug << endl;
+ }
+ }
+
+ if ( currentOp.shouldDBProfile( debug.executionTime ) ) {
+ // performance profiling is on
+ if ( d.dbMutex.getState() < 0 ) {
+ mongo::log(1) << "note: not profiling because recursive read lock" << endl;
+ }
+ else {
+ writelock lk;
+ if ( dbHolder()._isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ) {
+ Client::Context cx( currentOp.getNS() );
+ profile(c , currentOp );
+ }
+ else {
+ mongo::log() << "note: not profiling because db went away - probably a close on: " << currentOp.getNS() << endl;
+ }
+ }
+ }
+
+ debug.reset();
+ } /* _assembleResponse() */
+
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) {
+ PageFaultRetryableSection s;
+ while( 1 ) {
+ try {
+ _assembleResponse( m, dbresponse, remote );
+ break;
+ }
+ catch( PageFaultException& e ) {
+ DEV log() << "TEMP PageFaultException touch and retry" << endl;
+ e.touch();
+ }
+ }
+ }
+
+ void receivedKillCursors(Message& m) {
+ int *x = (int *) m.singleData()->_data;
+ x++; // reserved
+ int n = *x++;
+
+ uassert( 13659 , "sent 0 cursors to kill" , n != 0 );
+ massert( 13658 , str::stream() << "bad kill cursors size: " << m.dataSize() , m.dataSize() == 8 + ( 8 * n ) );
+ uassert( 13004 , str::stream() << "sent negative cursors to kill: " << n , n >= 1 );
+
+ if ( n > 2000 ) {
+ log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl;
+ assert( n < 30000 );
+ }
+
+ int found = ClientCursor::erase(n, (long long *) x);
+
+ if ( logLevel > 0 || found != n ) {
+ log( found == n ) << "killcursors: found " << found << " of " << n << endl;
+ }
+
+ }
+
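The asserts above pin down the OP_KILL_CURSORS body: a reserved int32, an int32 count n, then n 8-byte cursor ids, so dataSize must be exactly 8 + 8*n. A packed-struct sketch of that layout (illustrative only, not a type defined in this tree):

    #pragma pack(1)
    struct KillCursorsBody {      // follows the standard message header
        int reserved;             // skipped by the x++ above
        int nCursorIds;           // >= 1; warned at n > 2000, hard assert at 30000
        long long cursorIds[1];   // nCursorIds entries, 8 bytes each
    };
    #pragma pack()
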
+ /* db - database name
+ path - db directory
+ */
+ /*static*/ void Database::closeDatabase( const char *db, const string& path ) {
+ assertInWriteLock();
+
+ Client::Context * ctx = cc().getContext();
+ assert( ctx );
+ assert( ctx->inDB( db , path ) );
+ Database *database = ctx->db();
+ assert( database->name == db );
+
+ oplogCheckCloseDatabase( database ); // oplog caches some things, dirty its caches
+
+ if( BackgroundOperation::inProgForDb(db) ) {
+ log() << "warning: bg op in prog during close db? " << db << endl;
+ }
+
+ /* important: kill all open cursors on the database */
+ string prefix(db);
+ prefix += '.';
+ ClientCursor::invalidate(prefix.c_str());
+
+ NamespaceDetailsTransient::clearForPrefix( prefix.c_str() );
+
+ dbHolderW().erase( db, path );
+ ctx->_clear();
+ delete database; // closes files
+ }
+
+ void receivedUpdate(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+ int flags = d.pullInt();
+ BSONObj query = d.nextJsObj();
+
+ assert( d.moreJSObjs() );
+ assert( query.objsize() < m.header()->dataLen() );
+ BSONObj toupdate = d.nextJsObj();
+ uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize);
+ assert( toupdate.objsize() < m.header()->dataLen() );
+ assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() );
+ bool upsert = flags & UpdateOption_Upsert;
+ bool multi = flags & UpdateOption_Multi;
+ bool broadcast = flags & UpdateOption_Broadcast;
+
+ op.debug().query = query;
+ op.setQuery(query);
+
+ writelock lk;
+
+        // void ReplSetImpl::relinquish() uses the big write lock, so
+        // this is synchronized given our lock above.
+ uassert( 10054 , "not master", isMasterNs( ns ) );
+
+        // if this ever moves outside of the lock, the check in Client::Context::_finishInit needs to be adjusted
+ if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx( ns );
+
+ UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
+ lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror
+ }
+
+ void receivedDelete(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+ int flags = d.pullInt();
+ bool justOne = flags & RemoveOption_JustOne;
+ bool broadcast = flags & RemoveOption_Broadcast;
+ assert( d.moreJSObjs() );
+ BSONObj pattern = d.nextJsObj();
+
+ op.debug().query = pattern;
+ op.setQuery(pattern);
+
+ writelock lk(ns);
+
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10056 , "not master", isMasterNs( ns ) );
+
+        // if this ever moves outside of the lock, the check in Client::Context::_finishInit needs to be adjusted
+ if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx(ns);
+
+ long long n = deleteObjects(ns, pattern, justOne, true);
+ lastError.getSafe()->recordDelete( n );
+ }
+
+ QueryResult* emptyMoreResult(long long);
+
+ void OpTime::waitForDifferent(unsigned millis){
+ DEV d.dbMutex.assertAtLeastReadLocked();
+
+ if (*this != last) return; // check early
+
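+        // build an absolute deadline 'millis' milliseconds from now. boost::xtime
+        // keeps seconds and nanoseconds separately, so carry nsec overflow into sec.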
+ boost::xtime timeout;
+ boost::xtime_get(&timeout, boost::TIME_UTC);
+
+ timeout.nsec += millis * 1000*1000;
+ if (timeout.nsec >= 1000*1000*1000){
+ timeout.nsec -= 1000*1000*1000;
+ timeout.sec += 1;
+ }
+
+ do {
+ dbtemprelease tmp;
+ boost::mutex::scoped_lock lk(notifyMutex());
+ if (!notifier().timed_wait(lk, timeout))
+ return; // timed out
+ } while (*this != last);
+ }
+
+ bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) {
+ bool ok = true;
+
+ DbMessage d(m);
+
+ const char *ns = d.getns();
+ int ntoreturn = d.pullInt();
+ long long cursorid = d.pullInt64();
+
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = ntoreturn;
+ curop.debug().cursorid = cursorid;
+
+ time_t start = 0;
+ int pass = 0;
+ bool exhaust = false;
+ QueryResult* msgdata;
+ OpTime last;
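+        // for a tailing getMore on the oplog we may loop here, re-taking the read
+        // lock each pass, until data is available or roughly 4 seconds elapse.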
+ while( 1 ) {
+ try {
+ Client::ReadContext ctx(ns);
+ if (str::startsWith(ns, "local.oplog.")){
+ if (pass == 0)
+ last = OpTime::last_inlock();
+ else
+ last.waitForDifferent(1000/*ms*/);
+ }
+ msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust);
+ }
+ catch ( AssertionException& e ) {
+ exhaust = false;
+ curop.debug().exceptionInfo = e.getInfo();
+ msgdata = emptyMoreResult(cursorid);
+ ok = false;
+ }
+ if (msgdata == 0) {
+ exhaust = false;
+ massert(13073, "shutting down", !inShutdown() );
+ if( pass == 0 ) {
+ start = time(0);
+ }
+ else {
+ if( time(0) - start >= 4 ) {
+                        // after about 4 seconds, return. pass stops at 1000 normally;
+                        // we want to return occasionally so the slave can checkpoint.
+ pass = 10000;
+ }
+ }
+ pass++;
+ if (debug)
+ sleepmillis(20);
+ else
+ sleepmillis(2);
+ continue;
+ }
+ break;
+        }
+
+ Message *resp = new Message();
+ resp->setData(msgdata, true);
+ curop.debug().responseLength = resp->header()->dataLen();
+ curop.debug().nreturned = msgdata->nReturned;
+
+ dbresponse.response = resp;
+ dbresponse.responseTo = m.header()->id;
+
+ if( exhaust ) {
+ curop.debug().exhaust = true;
+ dbresponse.exhaust = ns;
+ }
+
+ return ok;
+ }
+
+ void checkAndInsert(const char *ns, /*modifies*/BSONObj& js) {
+ uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize);
+ {
+            // check for no $-prefixed field names. note we only check the top level (scanning deep would be quite expensive).
+ BSONObjIterator i( js );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
+ }
+ }
+ theDataFileMgr.insertWithObjMod(ns, js, false); // js may be modified in the call to add an _id field.
+ logOp("i", ns, js);
+ }
+
+ NOINLINE_DECL void insertMulti(bool keepGoing, const char *ns, vector<BSONObj>& objs) {
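+        // with keepGoing (ContinueOnError) a failed insert is skipped, unless it
+        // is the last object, in which case the exception propagates after the
+        // op counter is bumped by the number attempted so far.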
+ size_t i;
+ for (i=0; i<objs.size(); i++){
+ try {
+ checkAndInsert(ns, objs[i]);
+ getDur().commitIfNeeded();
+ } catch (const UserException&) {
+ if (!keepGoing || i == objs.size()-1){
+ globalOpCounters.incInsertInWriteLock(i);
+ throw;
+ }
+ // otherwise ignore and keep going
+ }
+ }
+
+ globalOpCounters.incInsertInWriteLock(i);
+ }
+
+ void receivedInsert(Message& m, CurOp& op) {
+ DbMessage d(m);
+ const char *ns = d.getns();
+ op.debug().ns = ns;
+
+ if( !d.moreJSObjs() ) {
+ // strange. should we complain?
+ return;
+ }
+ BSONObj first = d.nextJsObj();
+
+ vector<BSONObj> multi;
+ while (d.moreJSObjs()){
+ if (multi.empty()) // first pass
+ multi.push_back(first);
+ multi.push_back( d.nextJsObj() );
+ }
+
+ writelock lk(ns);
+ //LockCollectionExclusively lk(ns);
+
+        // CONCURRENCY TODO: is being read locked in the big lock sufficient here?
+ // writelock is used to synchronize stepdowns w/ writes
+ uassert( 10058 , "not master", isMasterNs(ns) );
+
+ if ( handlePossibleShardedMessage( m , 0 ) )
+ return;
+
+ Client::Context ctx(ns);
+
+ if( !multi.empty() ) {
+ const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError;
+ insertMulti(keepGoing, ns, multi);
+ return;
+ }
+
+ checkAndInsert(ns, first);
+ globalOpCounters.incInsertInWriteLock(1);
+ }
+
+ void getDatabaseNames( vector< string > &names , const string& usePath ) {
+ boost::filesystem::path path( usePath );
+ for ( boost::filesystem::directory_iterator i( path );
+ i != boost::filesystem::directory_iterator(); ++i ) {
+ if ( directoryperdb ) {
+ boost::filesystem::path p = *i;
+ string dbName = p.leaf();
+ p /= ( dbName + ".ns" );
+ if ( MMF::exists( p ) )
+ names.push_back( dbName );
+ }
+ else {
+ string fileName = boost::filesystem::path(*i).leaf();
+ if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" )
+ names.push_back( fileName.substr( 0, fileName.length() - 3 ) );
+ }
+ }
+ }
+
+ /* returns true if there is data on this server. useful when starting replication.
+ local database does NOT count except for rsoplog collection.
+ used to set the hasData field on replset heartbeat command response
+ */
+ bool replHasDatabases() {
+ vector<string> names;
+ getDatabaseNames(names);
+ if( names.size() >= 2 ) return true;
+ if( names.size() == 1 ) {
+ if( names[0] != "local" )
+ return true;
+ // we have a local database. return true if oplog isn't empty
+ {
+ readlock lk(rsoplog);
+ BSONObj o;
+ if( Helpers::getFirst(rsoplog, o) )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) {
+ if ( lastError._get() )
+ lastError.startRequest( toSend, lastError._get() );
+ DbResponse dbResponse;
+ assembleResponse( toSend, dbResponse , _clientHost );
+ assert( dbResponse.response );
+ dbResponse.response->concat(); // can get rid of this if we make response handling smarter
+ response = *dbResponse.response;
+ getDur().commitIfNeeded();
+ return true;
+ }
+
+ void DBDirectClient::say( Message &toSend, bool isRetry ) {
+ if ( lastError._get() )
+ lastError.startRequest( toSend, lastError._get() );
+ DbResponse dbResponse;
+ assembleResponse( toSend, dbResponse , _clientHost );
+ getDur().commitIfNeeded();
+ }
+
+ auto_ptr<DBClientCursor> DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip ,
+ const BSONObj *fieldsToReturn , int queryOptions ) {
+
+ //if ( ! query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions )
+ return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions );
+ //
+ //assert( query.obj.isEmpty() );
+ //throw UserException( (string)"yay:" + ns );
+ }
+
+ void DBDirectClient::killCursor( long long id ) {
+ ClientCursor::erase( id );
+ }
+
+ HostAndPort DBDirectClient::_clientHost = HostAndPort( "0.0.0.0" , 0 );
+
+ unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) {
+ LockCollectionForReading lk( ns );
+ string errmsg;
+ long long res = runCount( ns.c_str() , _countCmd( ns , query , options , limit , skip ) , errmsg );
+ if ( res == -1 )
+ return 0;
+ uassert( 13637 , str::stream() << "count failed in DBDirectClient: " << errmsg , res >= 0 );
+ return (unsigned long long )res;
+ }
+
+ DBClientBase * createDirectClient() {
+ return new DBDirectClient();
+ }
+
+ mongo::mutex exitMutex("exit");
+ AtomicUInt numExitCalls = 0;
+
+ bool inShutdown() {
+ return numExitCalls > 0;
+ }
+
+ void tryToOutputFatal( const string& s ) {
+ try {
+ rawOut( s );
+ return;
+ }
+ catch ( ... ) {}
+
+ try {
+ cerr << s << endl;
+ return;
+ }
+ catch ( ... ) {}
+
+        // uh-oh, not sure there is anything else we can do...
+ }
+
+ /** also called by ntservice.cpp */
+ void shutdownServer() {
+
+ log() << "shutdown: going to close listening sockets..." << endl;
+ ListeningSockets::get()->closeAll();
+
+ log() << "shutdown: going to flush diaglog..." << endl;
+ _diaglog.flush();
+
+ /* must do this before unmapping mem or you may get a seg fault */
+ log() << "shutdown: going to close sockets..." << endl;
+ boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) );
+
+ // wait until file preallocation finishes
+ // we would only hang here if the file_allocator code generates a
+ // synchronous signal, which we don't expect
+ log() << "shutdown: waiting for fs preallocator..." << endl;
+ FileAllocator::get()->waitUntilFinished();
+
+ if( cmdLine.dur ) {
+ log() << "shutdown: lock for final commit..." << endl;
+ {
+ int n = 10;
+ while( 1 ) {
+ // we may already be in a read lock from earlier in the call stack, so do read lock here
+ // to be consistent with that.
+ readlocktry w("", 20000);
+ if( w.got() ) {
+ log() << "shutdown: final commit..." << endl;
+ getDur().commitNow();
+ break;
+ }
+ if( --n <= 0 ) {
+ log() << "shutdown: couldn't acquire write lock, aborting" << endl;
+ mongoAbort("couldn't acquire write lock");
+ }
+ log() << "shutdown: waiting for write lock..." << endl;
+ }
+ }
+ MemoryMappedFile::flushAll(true);
+ }
+
+ log() << "shutdown: closing all files..." << endl;
+ stringstream ss3;
+ MemoryMappedFile::closeAllFiles( ss3 );
+ log() << ss3.str() << endl;
+
+ if( cmdLine.dur ) {
+ dur::journalCleanup(true);
+ }
+
+#if !defined(__sunos__)
+ if ( lockFile ) {
+ log() << "shutdown: removing fs lock..." << endl;
+ /* This ought to be an unlink(), but Eliot says the last
+ time that was attempted, there was a race condition
+ with acquirePathLock(). */
+#ifdef _WIN32
+ if( _chsize( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << WSAGetLastError() << endl;
+ CloseHandle(lockFileHandle);
+#else
+ if( ftruncate( lockFile , 0 ) )
+ log() << "couldn't remove fs lock " << errnoWithDescription() << endl;
+ flock( lockFile, LOCK_UN );
+#endif
+ }
+#endif
+ }
+
+ void exitCleanly( ExitCode code ) {
+ killCurrentOp.killAll();
+ {
+ dblock lk;
+ log() << "now exiting" << endl;
+ dbexit( code );
+ }
+ }
+
+
+ namespace dur {
+ extern mutex groupCommitMutex;
+ }
+
+ /* not using log() herein in case we are already locked */
+ NOINLINE_DECL void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) {
+
+ auto_ptr<writelocktry> wlt;
+ if ( tryToGetLock ) {
+ wlt.reset( new writelocktry( "" , 2 * 60 * 1000 ) );
+ uassert( 13455 , "dbexit timed out getting lock" , wlt->got() );
+ }
+
+ Client * c = currentClient.get();
+ {
+ scoped_lock lk( exitMutex );
+ if ( numExitCalls++ > 0 ) {
+ if ( numExitCalls > 5 ) {
+ // this means something horrible has happened
+ ::_exit( rc );
+ }
+ stringstream ss;
+ ss << "dbexit: " << why << "; exiting immediately";
+ tryToOutputFatal( ss.str() );
+ if ( c ) c->shutdown();
+ ::exit( rc );
+ }
+ }
+
+ {
+ stringstream ss;
+ ss << "dbexit: " << why;
+ tryToOutputFatal( ss.str() );
+ }
+
+ try {
+ shutdownServer(); // gracefully shutdown instance
+ }
+ catch ( ... ) {
+ tryToOutputFatal( "shutdown failed with exception" );
+ }
+
+#if defined(_DEBUG)
+ try {
+ mutexDebugger.programEnding();
+ }
+ catch (...) { }
+#endif
+
+ // block the dur thread from doing any work for the rest of the run
+ log(2) << "shutdown: groupCommitMutex" << endl;
+ scoped_lock lk(dur::groupCommitMutex);
+
+#ifdef _WIN32
+ // Windows Service Controller wants to be told when we are down,
+ // so don't call ::exit() yet, or say "really exiting now"
+ //
+ if ( rc == EXIT_WINDOWS_SERVICE_STOP ) {
+ if ( c ) c->shutdown();
+ return;
+ }
+#endif
+ tryToOutputFatal( "dbexit: really exiting now" );
+ if ( c ) c->shutdown();
+ ::exit(rc);
+ }
+
+#if !defined(__sunos__)
+ void writePid(int fd) {
+ stringstream ss;
+ ss << getpid() << endl;
+ string s = ss.str();
+ const char * data = s.c_str();
+#ifdef _WIN32
+ assert ( _write( fd, data, strlen( data ) ) );
+#else
+ assert ( write( fd, data, strlen( data ) ) );
+#endif
+ }
+
+ void acquirePathLock(bool doingRepair) {
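+        // creates/opens <dbpath>/mongod.lock and takes an exclusive lock on it;
+        // a pre-existing non-empty lock file indicates an unclean shutdown.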
+ string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+
+ bool oldFile = false;
+
+ if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ) {
+ oldFile = true;
+ }
+
+#ifdef _WIN32
+ lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE,
+ 0 /* do not allow anyone else access */, NULL,
+ OPEN_ALWAYS /* success if fh can open */, 0, NULL );
+
+ if (lockFileHandle == INVALID_HANDLE_VALUE) {
+ DWORD code = GetLastError();
+ char *msg;
+ FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPSTR)&msg, 0, NULL);
+ string m = msg;
+ str::stripTrailing(m, "\r\n");
+ uasserted( 13627 , str::stream() << "Unable to create/open lock file: " << name << ' ' << m << ". Is a mongod instance already running?" );
+ }
+ lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0);
+#else
+ lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO );
+ if( lockFile <= 0 ) {
+ uasserted( 10309 , str::stream() << "Unable to create/open lock file: " << name << ' ' << errnoWithDescription() << " Is a mongod instance already running?" );
+ }
+ if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) {
+ close ( lockFile );
+ lockFile = 0;
+ uassert( 10310 , "Unable to lock file: " + name + ". Is a mongod instance already running?", 0 );
+ }
+#endif
+
+ if ( oldFile ) {
+            // we check this here because we want to see if we can get the lock.
+            // if we can't, then it's probably just another mongod running.
+
+ string errmsg;
+ if (cmdLine.dur) {
+ if (!dur::haveJournalFiles()) {
+
+ vector<string> dbnames;
+ getDatabaseNames( dbnames );
+
+ if ( dbnames.size() == 0 ) {
+ // this means that mongod crashed
+ // between initial startup and when journaling was initialized
+ // it is safe to continue
+ }
+ else {
+ errmsg = str::stream()
+ << "************** \n"
+ << "old lock file: " << name << ". probably means unclean shutdown,\n"
+ << "but there are no journal files to recover.\n"
+ << "this is likely human error or filesystem corruption.\n"
+ << "found " << dbnames.size() << " dbs.\n"
+ << "see: http://dochub.mongodb.org/core/repair for more information\n"
+ << "*************";
+ }
+
+
+ }
+ }
+ else {
+ if (!dur::haveJournalFiles() && !doingRepair) {
+ errmsg = str::stream()
+ << "************** \n"
+ << "Unclean shutdown detected.\n"
+ << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n"
+ << "*************";
+ }
+ }
+
+ if (!errmsg.empty()) {
+ cout << errmsg << endl;
+#ifdef _WIN32
+ CloseHandle( lockFileHandle );
+#else
+ close ( lockFile );
+#endif
+ lockFile = 0;
+ uassert( 12596 , "old lock file" , 0 );
+ }
+ }
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without journaling enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "**************" << endl;
+ uasserted(13597, "can't start without --journal enabled when journal/ files are present");
+ }
+
+#ifdef _WIN32
+ uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0);
+ writePid( lockFile );
+ _commit( lockFile );
+#else
+ uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0);
+ writePid( lockFile );
+ fsync( lockFile );
+ flushMyDirectory(name);
+#endif
+ }
+#else
+ void acquirePathLock(bool) {
+        // TODO - it is very bad that the code above is not running here.
+
+ // Not related to lock file, but this is where we handle unclean shutdown
+ if( !cmdLine.dur && dur::haveJournalFiles() ) {
+ cout << "**************" << endl;
+ cout << "Error: journal files are present in journal directory, yet starting without --journal enabled." << endl;
+ cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
+ cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl;
+ cout << "**************" << endl;
+ uasserted(13618, "can't start without --journal enabled when journal/ files are present");
+ }
+ }
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/db/instance.h b/src/mongo/db/instance.h
new file mode 100644
index 00000000000..9dde729997d
--- /dev/null
+++ b/src/mongo/db/instance.h
@@ -0,0 +1,174 @@
+// instance.h : Global state functions.
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+
+#include "../client/dbclient.h"
+#include "curop-inl.h"
+#include "security.h"
+#include "cmdline.h"
+#include "client.h"
+
+namespace mongo {
+
+ extern string dbExecCommand;
+
+ /** a high level recording of operations to the database - sometimes used for diagnostics
+ and debugging.
+ */
+ class DiagLog {
+ ofstream *f; // note this is never freed
+        /* level is a bitmask: 0 = off; 1 = writes; 2 = reads; 3 = both;
+           7 = all writes plus an occasional sample of reads.
+        */
+ int level;
+ mongo::mutex mutex;
+ void openFile() {
+ assert( f == 0 );
+ stringstream ss;
+ ss << dbpath << "/diaglog." << hex << time(0);
+ string name = ss.str();
+ f = new ofstream(name.c_str(), ios::out | ios::binary);
+ if ( ! f->good() ) {
+ problem() << "diagLogging couldn't open " << name << endl;
+ // todo what is this? :
+ throw 1717;
+ }
+ else {
+ log() << "diagLogging using file " << name << endl;
+ }
+ }
+ public:
+ DiagLog() : f(0) , level(0), mutex("DiagLog") { }
+ int getLevel() const { return level; }
+ /**
+ * @return old
+ */
+ int setLevel( int newLevel ) {
+ scoped_lock lk(mutex);
+ int old = level;
+ log() << "diagLogging level=" << newLevel << endl;
+ if( f == 0 ) {
+ openFile();
+ }
+ level = newLevel; // must be done AFTER f is set
+ return old;
+ }
+ void flush() {
+ if ( level ) {
+ log() << "flushing diag log" << endl;
+ scoped_lock lk(mutex);
+ f->flush();
+ }
+ }
+ void write(char *data,int len) {
+ if ( level & 1 ) {
+ scoped_lock lk(mutex);
+ f->write(data,len);
+ }
+ }
+ void readop(char *data, int len) {
+ if ( level & 2 ) {
+ bool log = (level & 4) == 0;
+ OCCASIONALLY log = true;
+ if ( log ) {
+ scoped_lock lk(mutex);
+ assert( f );
+ f->write(data,len);
+ }
+ }
+ }
+ };
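+    /* hypothetical usage sketch (not from the original source): to capture all
+       writes plus a sample of reads, a caller would do something like
+           _diaglog.setLevel( 7 );
+           _diaglog.write( msgData, msgLen );   // per write op; names are illustrative
+       and _diaglog.flush() before shutdown. */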
+
+ extern DiagLog _diaglog;
+
+ /* we defer response until we unlock. don't want a blocked socket to
+ keep things locked.
+ */
+ struct DbResponse {
+ Message *response;
+ MSGID responseTo;
+ const char *exhaust; /* points to ns if exhaust mode. 0=normal mode*/
+ DbResponse(Message *r, MSGID rt) : response(r), responseTo(rt), exhaust(0) { }
+ DbResponse() {
+ response = 0;
+ exhaust = 0;
+ }
+ ~DbResponse() { delete response; }
+ };
+
+ void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort &client );
+
+ void getDatabaseNames( vector< string > &names , const string& usePath = dbpath );
+
+    /* returns true if there is data on this server. useful when starting replication.
+       local database does NOT count except for the rsoplog collection.
+    */
+ bool replHasDatabases();
+
+ /** "embedded" calls to the local server directly.
+ Caller does not need to lock, that is handled within.
+ */
+ class DBDirectClient : public DBClientBase {
+ public:
+ virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0,
+ const BSONObj *fieldsToReturn = 0, int queryOptions = 0);
+
+ virtual bool isFailed() const {
+ return false;
+ }
+ virtual string toString() {
+ return "DBDirectClient";
+ }
+ virtual string getServerAddress() const {
+ return "localhost"; // TODO: should this have the port?
+ }
+ virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 );
+ virtual void say( Message &toSend, bool isRetry = false );
+ virtual void sayPiggyBack( Message &toSend ) {
+ // don't need to piggy back when connected locally
+ return say( toSend );
+ }
+
+ virtual void killCursor( long long cursorID );
+
+ virtual bool callRead( Message& toSend , Message& response ) {
+ return call( toSend , response );
+ }
+
+ virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 );
+
+ virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; }
+
+ double getSoTimeout() const { return 0; }
+
+ virtual bool lazySupported() const { return true; }
+ private:
+ static HostAndPort _clientHost;
+ };
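+    /* hypothetical usage sketch (not from the original source):
+           DBDirectClient c;
+           unsigned long long n = c.count( "test.foo" );
+       runs against this server in-process; no socket, locking handled inside. */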
+
+ extern int lockFile;
+#ifdef _WIN32
+ extern HANDLE lockFileHandle;
+#endif
+ void acquirePathLock(bool doingRepair=false); // if doingRepair=true don't consider unclean shutdown an error
+ void maybeCreatePidFile();
+
+} // namespace mongo
diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp
new file mode 100644
index 00000000000..7e1d19ce2f3
--- /dev/null
+++ b/src/mongo/db/introspect.cpp
@@ -0,0 +1,88 @@
+// introspect.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "introspect.h"
+#include "../bson/util/builder.h"
+#include "../util/goodies.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "pdfile.h"
+#include "curop.h"
+
+namespace mongo {
+
+ BufBuilder profileBufBuilder; // reused, instead of allocated every time - avoids a malloc/free cycle
+
+ void profile( const Client& c , CurOp& currentOp ) {
+ assertInWriteLock();
+
+ Database *db = c.database();
+ DEV assert( db );
+ const char *ns = db->profileName.c_str();
+
+ // build object
+ profileBufBuilder.reset();
+ BSONObjBuilder b(profileBufBuilder);
+ b.appendDate("ts", jsTime());
+ currentOp.debug().append( currentOp , b );
+
+ b.append("client", c.clientAddress() );
+
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
+ BSONObj p = b.done();
+
+ if (p.objsize() > 100*1024){
+ string small = p.toString(/*isArray*/false, /*full*/false);
+
+ warning() << "can't add full line to system.profile: " << small;
+
+ // rebuild with limited info
+ BSONObjBuilder b(profileBufBuilder);
+ b.appendDate("ts", jsTime());
+ b.append("client", c.clientAddress() );
+ if ( c.getAuthenticationInfo() )
+ b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );
+
+ b.append("err", "profile line too large (max is 100KB)");
+ if (small.size() < 100*1024){ // should be much smaller but if not don't break anything
+ b.append("abbreviated", small);
+ }
+
+ p = b.done();
+ }
+
+ // write: not replicated
+ NamespaceDetails *d = db->namespaceIndex.details(ns);
+ if( d ) {
+ int len = p.objsize();
+ Record *r = theDataFileMgr.fast_oplog_insert(d, ns, len);
+ memcpy(getDur().writingPtr(r->data, len), p.objdata(), len);
+ }
+ else {
+ static time_t last;
+ if( time(0) > last+10 ) {
+ log() << "profile: warning ns " << ns << " does not exist" << endl;
+ last = time(0);
+ }
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/introspect.h b/src/mongo/db/introspect.h
new file mode 100644
index 00000000000..209eeacab7c
--- /dev/null
+++ b/src/mongo/db/introspect.h
@@ -0,0 +1,34 @@
+// introspect.h
+// system management stuff.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+#include "pdfile.h"
+
+namespace mongo {
+
+ /* --- profiling --------------------------------------------
+ do when database->profile is set
+ */
+
+ void profile( const Client& c , CurOp& currentOp );
+
+} // namespace mongo
diff --git a/src/mongo/db/javatest.cpp b/src/mongo/db/javatest.cpp
new file mode 100644
index 00000000000..22f2bdf8d3c
--- /dev/null
+++ b/src/mongo/db/javatest.cpp
@@ -0,0 +1,24 @@
+// javatest.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "javajs.h"
+
+int main() {
+ JavaJS = new JavaJSImpl();
+ javajstest();
+}
diff --git a/src/mongo/db/jsobj.cpp b/src/mongo/db/jsobj.cpp
new file mode 100644
index 00000000000..1e850982396
--- /dev/null
+++ b/src/mongo/db/jsobj.cpp
@@ -0,0 +1,1268 @@
+/** @file jsobj.cpp - BSON implementation
+ http://www.mongodb.org/display/DOCS/BSON
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "../bson/oid.h"
+#include "jsobj.h"
+#include "nonce.h"
+#include "../bson/util/atomic_int.h"
+#include "../util/base64.h"
+#include "../util/md5.hpp"
+#include <limits>
+#include <cmath>
+#include "../util/unittest.h"
+#include "../util/embedded_builder.h"
+#include "../util/stringutils.h"
+#include "../util/mongoutils/str.h"
+#include "json.h"
+#include "jsobjmanipulator.h"
+#include "../util/optime.h"
+#include <boost/static_assert.hpp>
+#undef assert
+#define assert MONGO_assert
+
+// make sure our assumptions are valid
+BOOST_STATIC_ASSERT( sizeof(short) == 2 );
+BOOST_STATIC_ASSERT( sizeof(int) == 4 );
+BOOST_STATIC_ASSERT( sizeof(long long) == 8 );
+BOOST_STATIC_ASSERT( sizeof(double) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::Date_t) == 8 );
+BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 );
+
+namespace mongo {
+
+ BSONElement eooElement;
+
+ GENOIDLabeler GENOID;
+
+ DateNowLabeler DATENOW;
+ NullLabeler BSONNULL;
+
+ MinKeyLabeler MINKEY;
+ MaxKeyLabeler MAXKEY;
+
+ // need to move to bson/, but has dependency on base64 so move that to bson/util/ first.
+ inline string BSONElement::jsonString( JsonStringFormat format, bool includeFieldNames, int pretty ) const {
+ BSONType t = type();
+ int sign;
+ if ( t == Undefined )
+ return "undefined";
+
+ stringstream s;
+ if ( includeFieldNames )
+ s << '"' << escape( fieldName() ) << "\" : ";
+ switch ( type() ) {
+ case mongo::String:
+ case Symbol:
+ s << '"' << escape( string(valuestr(), valuestrsize()-1) ) << '"';
+ break;
+ case NumberLong:
+ s << _numberLong();
+ break;
+ case NumberInt:
+ case NumberDouble:
+ if ( number() >= -numeric_limits< double >::max() &&
+ number() <= numeric_limits< double >::max() ) {
+ s.precision( 16 );
+ s << number();
+ }
+ else if ( mongo::isNaN(number()) ) {
+ s << "NaN";
+ }
+ else if ( mongo::isInf(number(), &sign) ) {
+ s << ( sign == 1 ? "Infinity" : "-Infinity");
+ }
+ else {
+ StringBuilder ss;
+ ss << "Number " << number() << " cannot be represented in JSON";
+ string message = ss.str();
+ massert( 10311 , message.c_str(), false );
+ }
+ break;
+ case mongo::Bool:
+ s << ( boolean() ? "true" : "false" );
+ break;
+ case jstNULL:
+ s << "null";
+ break;
+ case Object:
+ s << embeddedObject().jsonString( format, pretty );
+ break;
+ case mongo::Array: {
+ if ( embeddedObject().isEmpty() ) {
+ s << "[]";
+ break;
+ }
+ s << "[ ";
+ BSONObjIterator i( embeddedObject() );
+ BSONElement e = i.next();
+ if ( !e.eoo() ) {
+ int count = 0;
+ while ( 1 ) {
+ if( pretty ) {
+ s << '\n';
+ for( int x = 0; x < pretty; x++ )
+ s << " ";
+ }
+
+ if (strtol(e.fieldName(), 0, 10) > count) {
+ s << "undefined";
+ }
+ else {
+ s << e.jsonString( format, false, pretty?pretty+1:0 );
+ e = i.next();
+ }
+ count++;
+ if ( e.eoo() )
+ break;
+ s << ", ";
+ }
+ }
+ s << " ]";
+ break;
+ }
+ case DBRef: {
+ mongo::OID *x = (mongo::OID *) (valuestr() + valuestrsize());
+ if ( format == TenGen )
+ s << "Dbref( ";
+ else
+ s << "{ \"$ref\" : ";
+ s << '"' << valuestr() << "\", ";
+ if ( format != TenGen )
+ s << "\"$id\" : ";
+ s << '"' << *x << "\" ";
+ if ( format == TenGen )
+ s << ')';
+ else
+ s << '}';
+ break;
+ }
+ case jstOID:
+ if ( format == TenGen ) {
+ s << "ObjectId( ";
+ }
+ else {
+ s << "{ \"$oid\" : ";
+ }
+ s << '"' << __oid() << '"';
+ if ( format == TenGen ) {
+ s << " )";
+ }
+ else {
+ s << " }";
+ }
+ break;
+ case BinData: {
+ int len = *(int *)( value() );
+ BinDataType type = BinDataType( *(char *)( (int *)( value() ) + 1 ) );
+ s << "{ \"$binary\" : \"";
+ char *start = ( char * )( value() ) + sizeof( int ) + 1;
+ base64::encode( s , start , len );
+ s << "\", \"$type\" : \"" << hex;
+ s.width( 2 );
+ s.fill( '0' );
+ s << type << dec;
+ s << "\" }";
+ break;
+ }
+ case mongo::Date:
+ if ( format == Strict )
+ s << "{ \"$date\" : ";
+ else
+ s << "Date( ";
+ if( pretty ) {
+ Date_t d = date();
+ if( d == 0 ) s << '0';
+ else
+ s << '"' << date().toString() << '"';
+ }
+ else
+ s << date();
+ if ( format == Strict )
+ s << " }";
+ else
+ s << " )";
+ break;
+ case RegEx:
+ if ( format == Strict ) {
+ s << "{ \"$regex\" : \"" << escape( regex() );
+ s << "\", \"$options\" : \"" << regexFlags() << "\" }";
+ }
+ else {
+ s << "/" << escape( regex() , true ) << "/";
+ // FIXME Worry about alpha order?
+ for ( const char *f = regexFlags(); *f; ++f ) {
+ switch ( *f ) {
+ case 'g':
+ case 'i':
+ case 'm':
+ s << *f;
+ default:
+ break;
+ }
+ }
+ }
+ break;
+
+ case CodeWScope: {
+ BSONObj scope = codeWScopeObject();
+ if ( ! scope.isEmpty() ) {
+ s << "{ \"$code\" : " << _asCode() << " , "
+ << " \"$scope\" : " << scope.jsonString() << " }";
+ break;
+ }
+ }
+
+ case Code:
+ s << _asCode();
+ break;
+
+ case Timestamp:
+ s << "{ \"t\" : " << timestampTime() << " , \"i\" : " << timestampInc() << " }";
+ break;
+
+ case MinKey:
+ s << "{ \"$minKey\" : 1 }";
+ break;
+
+ case MaxKey:
+ s << "{ \"$maxKey\" : 1 }";
+ break;
+
+ default:
+ StringBuilder ss;
+ ss << "Cannot create a properly formatted JSON string with "
+ << "element: " << toString() << " of type: " << type();
+ string message = ss.str();
+ massert( 10312 , message.c_str(), false );
+ }
+ return s.str();
+ }
+
+ int BSONElement::getGtLtOp( int def ) const {
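+        // decodes an operator field name ("$gt", "$lte", "$in", ...) by matching
+        // characters directly rather than via strcmp; note that "$near..." is
+        // matched on the prefix alone.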
+ const char *fn = fieldName();
+ if ( fn[0] == '$' && fn[1] ) {
+ if ( fn[2] == 't' ) {
+ if ( fn[1] == 'g' ) {
+ if ( fn[3] == 0 ) return BSONObj::GT;
+ else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::GTE;
+ }
+ else if ( fn[1] == 'l' ) {
+ if ( fn[3] == 0 ) return BSONObj::LT;
+ else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE;
+ }
+ }
+ else if ( fn[1] == 'n' && fn[2] == 'e' ) {
+ if ( fn[3] == 0 )
+ return BSONObj::NE;
+ if ( fn[3] == 'a' && fn[4] == 'r') // matches anything with $near prefix
+ return BSONObj::opNEAR;
+ }
+ else if ( fn[1] == 'm' ) {
+ if ( fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 )
+ return BSONObj::opMOD;
+ if ( fn[2] == 'a' && fn[3] == 'x' && fn[4] == 'D' && fn[5] == 'i' && fn[6] == 's' && fn[7] == 't' && fn[8] == 'a' && fn[9] == 'n' && fn[10] == 'c' && fn[11] == 'e' && fn[12] == 0 )
+ return BSONObj::opMAX_DISTANCE;
+ }
+ else if ( fn[1] == 't' && fn[2] == 'y' && fn[3] == 'p' && fn[4] == 'e' && fn[5] == 0 )
+ return BSONObj::opTYPE;
+ else if ( fn[1] == 'i' && fn[2] == 'n' && fn[3] == 0 )
+ return BSONObj::opIN;
+ else if ( fn[1] == 'n' && fn[2] == 'i' && fn[3] == 'n' && fn[4] == 0 )
+ return BSONObj::NIN;
+ else if ( fn[1] == 'a' && fn[2] == 'l' && fn[3] == 'l' && fn[4] == 0 )
+ return BSONObj::opALL;
+ else if ( fn[1] == 's' && fn[2] == 'i' && fn[3] == 'z' && fn[4] == 'e' && fn[5] == 0 )
+ return BSONObj::opSIZE;
+ else if ( fn[1] == 'e' ) {
+ if ( fn[2] == 'x' && fn[3] == 'i' && fn[4] == 's' && fn[5] == 't' && fn[6] == 's' && fn[7] == 0 )
+ return BSONObj::opEXISTS;
+ if ( fn[2] == 'l' && fn[3] == 'e' && fn[4] == 'm' && fn[5] == 'M' && fn[6] == 'a' && fn[7] == 't' && fn[8] == 'c' && fn[9] == 'h' && fn[10] == 0 )
+ return BSONObj::opELEM_MATCH;
+ }
+ else if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'g' && fn[4] == 'e' && fn[5] == 'x' && fn[6] == 0 )
+ return BSONObj::opREGEX;
+ else if ( fn[1] == 'o' && fn[2] == 'p' && fn[3] == 't' && fn[4] == 'i' && fn[5] == 'o' && fn[6] == 'n' && fn[7] == 's' && fn[8] == 0 )
+ return BSONObj::opOPTIONS;
+ else if ( fn[1] == 'w' && fn[2] == 'i' && fn[3] == 't' && fn[4] == 'h' && fn[5] == 'i' && fn[6] == 'n' && fn[7] == 0 )
+ return BSONObj::opWITHIN;
+ }
+ return def;
+ }
+
+ /* Matcher --------------------------------------*/
+
+// If the element is something like:
+// a : { $gt : 3 }
+// we append
+// a : 3
+// else we just append the element.
+//
+ void appendElementHandlingGtLt(BSONObjBuilder& b, const BSONElement& e) {
+ if ( e.type() == Object ) {
+ BSONElement fe = e.embeddedObject().firstElement();
+ const char *fn = fe.fieldName();
+ if ( fn[0] == '$' && fn[1] && fn[2] == 't' ) {
+ b.appendAs(fe, e.fieldName());
+ return;
+ }
+ }
+ b.append(e);
+ }
+
+ int getGtLtOp(const BSONElement& e) {
+ if ( e.type() != Object )
+ return BSONObj::Equality;
+
+ BSONElement fe = e.embeddedObject().firstElement();
+ return fe.getGtLtOp();
+ }
+
+ FieldCompareResult compareDottedFieldNames( const string& l , const string& r ) {
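+        // compares one '.'-separated segment at a time with lexNumCmp; if one
+        // name is exhausted first, it is the parent of the other (SUBFIELD result).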
+ static int maxLoops = 1024 * 1024;
+
+ size_t lstart = 0;
+ size_t rstart = 0;
+
+ for ( int i=0; i<maxLoops; i++ ) {
+
+ size_t a = l.find( '.' , lstart );
+ size_t b = r.find( '.' , rstart );
+
+ size_t lend = a == string::npos ? l.size() : a;
+ size_t rend = b == string::npos ? r.size() : b;
+
+ const string& c = l.substr( lstart , lend - lstart );
+ const string& d = r.substr( rstart , rend - rstart );
+
+ int x = lexNumCmp( c.c_str(), d.c_str() );
+
+ if ( x < 0 )
+ return LEFT_BEFORE;
+ if ( x > 0 )
+ return RIGHT_BEFORE;
+
+ lstart = lend + 1;
+ rstart = rend + 1;
+
+ if ( lstart >= l.size() ) {
+ if ( rstart >= r.size() )
+ return SAME;
+ return RIGHT_SUBFIELD;
+ }
+ if ( rstart >= r.size() )
+ return LEFT_SUBFIELD;
+ }
+
+ log() << "compareDottedFieldNames ERROR l: " << l << " r: " << r << " TOO MANY LOOPS" << endl;
+ assert(0);
+ return SAME; // will never get here
+ }
+
+ /* BSONObj ------------------------------------------------------------*/
+
+ string BSONObj::md5() const {
+ md5digest d;
+ md5_state_t st;
+ md5_init(&st);
+ md5_append( &st , (const md5_byte_t*)_objdata , objsize() );
+ md5_finish(&st, d);
+ return digestToString( d );
+ }
+
+ string BSONObj::jsonString( JsonStringFormat format, int pretty ) const {
+
+ if ( isEmpty() ) return "{}";
+
+ StringBuilder s;
+ s << "{ ";
+ BSONObjIterator i(*this);
+ BSONElement e = i.next();
+ if ( !e.eoo() )
+ while ( 1 ) {
+ s << e.jsonString( format, true, pretty?pretty+1:0 );
+ e = i.next();
+ if ( e.eoo() )
+ break;
+ s << ",";
+ if ( pretty ) {
+ s << '\n';
+ for( int x = 0; x < pretty; x++ )
+ s << " ";
+ }
+ else {
+ s << " ";
+ }
+ }
+ s << " }";
+ return s.str();
+ }
+
+ bool BSONObj::valid() const {
+ try {
+ BSONObjIterator it(*this);
+ while( it.moreWithEOO() ) {
+ // both throw exception on failure
+ BSONElement e = it.next(true);
+ e.validate();
+
+ if (e.eoo()) {
+ if (it.moreWithEOO())
+ return false;
+ return true;
+ }
+ else if (e.isABSONObj()) {
+ if(!e.embeddedObject().valid())
+ return false;
+ }
+ else if (e.type() == CodeWScope) {
+ if(!e.codeWScopeObject().valid())
+ return false;
+ }
+ }
+ }
+ catch (...) {
+ }
+ return false;
+ }
+
+ int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const {
+ if ( isEmpty() )
+ return r.isEmpty() ? 0 : -1;
+ if ( r.isEmpty() )
+ return 1;
+
+ BSONObjIterator i(*this);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
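+        // 'mask' selects the Ordering bit that says whether the corresponding
+        // key field is descending; shifted once per field compared.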
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = l.woCompare( r, considerFieldName );
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* well ordered compare */
+ int BSONObj::woCompare(const BSONObj &r, const BSONObj &idxKey,
+ bool considerFieldName) const {
+ if ( isEmpty() )
+ return r.isEmpty() ? 0 : -1;
+ if ( r.isEmpty() )
+ return 1;
+
+ bool ordered = !idxKey.isEmpty();
+
+ BSONObjIterator i(*this);
+ BSONObjIterator j(r);
+ BSONObjIterator k(idxKey);
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ BSONElement o;
+ if ( ordered )
+ o = k.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ /*
+ if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 &&
+ l.type() == String && r.type() == String ) {
+ // note: no negative support yet, as this is just sort of a POC
+ x = _stricmp(l.valuestr(), r.valuestr());
+ }
+ else*/ {
+ x = l.woCompare( r, considerFieldName );
+ if ( ordered && o.number() < 0 )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ }
+ return -1;
+ }
+
+ BSONObj staticNull = fromjson( "{'':null}" );
+ BSONObj makeUndefined() {
+ BSONObjBuilder b;
+ b.appendUndefined( "" );
+ return b.obj();
+ }
+ BSONObj staticUndefined = makeUndefined();
+
+ /* well ordered compare */
+ int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const {
+ if ( isEmpty() )
+ return other.isEmpty() ? 0 : -1;
+ if ( other.isEmpty() )
+ return 1;
+
+ uassert( 10060 , "woSortOrder needs a non-empty sortKey" , ! sortKey.isEmpty() );
+
+ BSONObjIterator i(sortKey);
+ while ( 1 ) {
+ BSONElement f = i.next();
+ if ( f.eoo() )
+ return 0;
+
+ BSONElement l = useDotted ? getFieldDotted( f.fieldName() ) : getField( f.fieldName() );
+ if ( l.eoo() )
+ l = staticNull.firstElement();
+ BSONElement r = useDotted ? other.getFieldDotted( f.fieldName() ) : other.getField( f.fieldName() );
+ if ( r.eoo() )
+ r = staticNull.firstElement();
+
+ int x = l.woCompare( r, false );
+ if ( f.number() < 0 )
+ x = -x;
+ if ( x != 0 )
+ return x;
+ }
+ return -1;
+ }
+
+ template <typename BSONElementColl>
+ void _getFieldsDotted( const BSONObj* obj, const StringData& name, BSONElementColl &ret, bool expandLastArray ) {
+ BSONElement e = obj->getField( name );
+
+ if ( e.eoo() ) {
+ const char *p = strchr(name.data(), '.');
+ if ( p ) {
+ string left(name.data(), p-name.data());
+ const char* next = p+1;
+ BSONElement e = obj->getField( left.c_str() );
+
+ if (e.type() == Object) {
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ else if (e.type() == Array) {
+ bool allDigits = false;
+ if ( isdigit( *next ) ) {
+ const char * temp = next + 1;
+ while ( isdigit( *temp ) )
+ temp++;
+ allDigits = (*temp == '.' || *temp == '\0');
+ }
+ if (allDigits) {
+ e.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ else {
+ BSONObjIterator i(e.embeddedObject());
+ while ( i.more() ) {
+ BSONElement e2 = i.next();
+ if (e2.type() == Object || e2.type() == Array)
+ e2.embeddedObject().getFieldsDotted(next, ret, expandLastArray );
+ }
+ }
+ }
+ else {
+ // do nothing: no match
+ }
+ }
+ }
+ else {
+ if (e.type() == Array && expandLastArray) {
+ BSONObjIterator i(e.embeddedObject());
+ while ( i.more() )
+ ret.insert(i.next());
+ }
+ else {
+ ret.insert(e);
+ }
+ }
+ }
+
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+ void BSONObj::getFieldsDotted(const StringData& name, BSONElementMSet &ret, bool expandLastArray ) const {
+ _getFieldsDotted( this, name, ret, expandLastArray );
+ }
+
+ BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const {
+ const char *p = strchr(name, '.');
+
+ BSONElement sub;
+
+ if ( p ) {
+ sub = getField( string(name, p-name) );
+ name = p + 1;
+ }
+ else {
+ sub = getField( name );
+ name = name + strlen(name);
+ }
+
+ if ( sub.eoo() )
+ return eooElement;
+ else if ( sub.type() == Array || name[0] == '\0' )
+ return sub;
+ else if ( sub.type() == Object )
+ return sub.embeddedObject().getFieldDottedOrArray( name );
+ else
+ return eooElement;
+ }
+
+ /**
+ sets element field names to empty string
+ If a field in pattern is missing, it is omitted from the returned
+ object.
+ */
+ BSONObj BSONObj::extractFieldsUnDotted(BSONObj pattern) const {
+ BSONObjBuilder b;
+ BSONObjIterator i(pattern);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = getField(e.fieldName());
+ if ( !x.eoo() )
+ b.appendAs(x, "");
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::extractFields(const BSONObj& pattern , bool fillWithNull ) const {
+ BSONObjBuilder b(32); // scanandorder.h can make a zillion of these, so we start the allocation very small
+ BSONObjIterator i(pattern);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = getFieldDotted(e.fieldName());
+ if ( ! x.eoo() )
+ b.appendAs( x, e.fieldName() );
+ else if ( fillWithNull )
+ b.appendNull( e.fieldName() );
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::filterFieldsUndotted( const BSONObj &filter, bool inFilter ) const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ BSONElement x = filter.getField( e.fieldName() );
+ if ( ( x.eoo() && !inFilter ) ||
+ ( !x.eoo() && inFilter ) )
+ b.append( e );
+ }
+ return b.obj();
+ }
+
+ BSONElement BSONObj::getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const {
+ BSONObjIterator i( indexKey );
+ int j = 0;
+ while( i.moreWithEOO() ) {
+ BSONElement f = i.next();
+ if ( f.eoo() )
+ return BSONElement();
+ if ( strcmp( f.fieldName(), fieldName ) == 0 )
+ break;
+ ++j;
+ }
+ BSONObjIterator k( *this );
+ while( k.moreWithEOO() ) {
+ BSONElement g = k.next();
+ if ( g.eoo() )
+ return BSONElement();
+ if ( j == 0 ) {
+ return g;
+ }
+ --j;
+ }
+ return BSONElement();
+ }
+
+ /* grab names of all the fields in this object */
+ int BSONObj::getFieldNames(set<string>& fields) const {
+ int n = 0;
+ BSONObjIterator i(*this);
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ fields.insert(e.fieldName());
+ n++;
+ }
+ return n;
+ }
+
+    /* note: addFields always adds _id even if not specified.
+       returns the number of fields added, not counting _id unless it was requested.
+    */
+ int BSONObj::addFields(BSONObj& from, set<string>& fields) {
+ assert( isEmpty() && !isOwned() ); /* partial implementation for now... */
+
+ BSONObjBuilder b;
+
+ int N = fields.size();
+ int n = 0;
+ BSONObjIterator i(from);
+ bool gotId = false;
+ while ( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ const char *fname = e.fieldName();
+ if ( fields.count(fname) ) {
+ b.append(e);
+ ++n;
+ gotId = gotId || strcmp(fname, "_id")==0;
+ if ( n == N && gotId )
+ break;
+ }
+ else if ( strcmp(fname, "_id")==0 ) {
+ b.append(e);
+ gotId = true;
+ if ( n == N && gotId )
+ break;
+ }
+ }
+
+ if ( n ) {
+ *this = b.obj();
+ }
+
+ return n;
+ }
+
+ bool BSONObj::couldBeArray() const {
+ BSONObjIterator i( *this );
+ int index = 0;
+ while( i.moreWithEOO() ){
+ BSONElement e = i.next();
+ if( e.eoo() ) break;
+
+ // TODO: If actually important, may be able to do int->char* much faster
+ if( strcmp( e.fieldName(), ((string)( mongoutils::str::stream() << index )).c_str() ) != 0 )
+ return false;
+ index++;
+ }
+ return true;
+ }
+
+ BSONObj BSONObj::clientReadable() const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ switch( e.type() ) {
+ case MinKey: {
+ BSONObjBuilder m;
+ m.append( "$minElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ case MaxKey: {
+ BSONObjBuilder m;
+ m.append( "$maxElement", 1 );
+ b.append( e.fieldName(), m.done() );
+ break;
+ }
+ default:
+ b.append( e );
+ }
+ }
+ return b.obj();
+ }
+
+ BSONObj BSONObj::replaceFieldNames( const BSONObj &names ) const {
+ BSONObjBuilder b;
+ BSONObjIterator i( *this );
+ BSONObjIterator j( names );
+ BSONElement f = j.moreWithEOO() ? j.next() : BSONObj().firstElement();
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( !f.eoo() ) {
+ b.appendAs( e, f.fieldName() );
+ f = j.next();
+ }
+ else {
+ b.append( e );
+ }
+ }
+ return b.obj();
+ }
+
+ bool BSONObj::okForStorage() const {
+ BSONObjIterator i( *this );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char * name = e.fieldName();
+
+ if ( strchr( name , '.' ) ||
+ strchr( name , '$' ) ) {
+ return
+ strcmp( name , "$ref" ) == 0 ||
+ strcmp( name , "$id" ) == 0
+ ;
+ }
+
+ if ( e.mayEncapsulate() ) {
+ switch ( e.type() ) {
+ case Object:
+ case Array:
+ if ( ! e.embeddedObject().okForStorage() )
+ return false;
+ break;
+ case CodeWScope:
+ if ( ! e.codeWScopeObject().okForStorage() )
+ return false;
+ break;
+ default:
+ uassert( 12579, "unhandled cases in BSONObj okForStorage" , 0 );
+ }
+
+ }
+ }
+ return true;
+ }
+
+ void BSONObj::dump() const {
+ out() << hex;
+ const char *p = objdata();
+ for ( int i = 0; i < objsize(); i++ ) {
+ out() << i << '\t' << ( 0xff & ( (unsigned) *p ) );
+ if ( *p >= 'A' && *p <= 'z' )
+ out() << '\t' << *p;
+ out() << endl;
+ p++;
+ }
+ }
+
+ void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base) {
+ BSONObjIterator it(obj);
+ while (it.more()) {
+ BSONElement e = it.next();
+ if (e.type() == Object) {
+ string newbase = base + e.fieldName() + ".";
+ nested2dotted(b, e.embeddedObject(), newbase);
+ }
+ else {
+ string newbase = base + e.fieldName();
+ b.appendAs(e, newbase);
+ }
+ }
+ }
+
+ void dotted2nested(BSONObjBuilder& b, const BSONObj& obj) {
+ //use map to sort fields
+ BSONMap sorted = bson2map(obj);
+ EmbeddedBuilder eb(&b);
+ for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it) {
+ eb.appendAs(it->second, it->first);
+ }
+ eb.done();
+ }
+
+ /*-- test things ----------------------------------------------------*/
+
+#pragma pack(1)
+ struct MaxKeyData {
+ MaxKeyData() {
+ totsize=7;
+ maxkey=MaxKey;
+ name=0;
+ eoo=EOO;
+ }
+ int totsize;
+ char maxkey;
+ char name;
+ char eoo;
+ } maxkeydata;
+ BSONObj maxKey((const char *) &maxkeydata);
+
+ struct MinKeyData {
+ MinKeyData() {
+ totsize=7;
+ minkey=MinKey;
+ name=0;
+ eoo=EOO;
+ }
+ int totsize;
+ char minkey;
+ char name;
+ char eoo;
+ } minkeydata;
+ BSONObj minKey((const char *) &minkeydata);
+
+ /*
+ struct JSObj0 {
+ JSObj0() {
+ totsize = 5;
+ eoo = EOO;
+ }
+ int totsize;
+ char eoo;
+ } js0;
+ */
+#pragma pack()
+
+ struct BsonUnitTest : public UnitTest {
+ void testRegex() {
+
+ BSONObjBuilder b;
+ b.appendRegex("x", "foo");
+ BSONObj o = b.done();
+
+ BSONObjBuilder c;
+ c.appendRegex("x", "goo");
+ BSONObj p = c.done();
+
+ assert( !o.binaryEqual( p ) );
+ assert( o.woCompare( p ) < 0 );
+
+ }
+ void testoid() {
+ OID id;
+ id.init();
+ // sleepsecs(3);
+
+ OID b;
+ // goes with sleep above...
+ // b.init();
+ // assert( memcmp(id.getData(), b.getData(), 12) < 0 );
+
+ b.init( id.str() );
+ assert( b == id );
+ }
+
+ void testbounds() {
+ BSONObj l , r;
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<long long>::max() );
+ l = b.obj();
+ }
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<double>::max() );
+ r = b.obj();
+ }
+ assert( l.woCompare( r ) < 0 );
+ assert( r.woCompare( l ) > 0 );
+ {
+ BSONObjBuilder b;
+ b.append( "x" , numeric_limits<int>::max() );
+ l = b.obj();
+ }
+ assert( l.woCompare( r ) < 0 );
+ assert( r.woCompare( l ) > 0 );
+ }
+
+ void testorder() {
+ {
+ BSONObj x,y,z;
+ { BSONObjBuilder b; b.append( "x" , (long long)2 ); x = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (int)3 ); y = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (long long)4 ); z = b.obj(); }
+ assert( x.woCompare( y ) < 0 );
+ assert( x.woCompare( z ) < 0 );
+ assert( y.woCompare( x ) > 0 );
+ assert( z.woCompare( x ) > 0 );
+ assert( y.woCompare( z ) < 0 );
+ assert( z.woCompare( y ) > 0 );
+ }
+
+ {
+ BSONObj ll,d,i,n,u;
+ { BSONObjBuilder b; b.append( "x" , (long long)2 ); ll = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (double)2 ); d = b.obj(); }
+ { BSONObjBuilder b; b.append( "x" , (int)2 ); i = b.obj(); }
+ { BSONObjBuilder b; b.appendNull( "x" ); n = b.obj(); }
+ { BSONObjBuilder b; u = b.obj(); }
+
+ assert( ll.woCompare( u ) == d.woCompare( u ) );
+ assert( ll.woCompare( u ) == i.woCompare( u ) );
+ BSONObj k = BSON( "x" << 1 );
+ assert( ll.woCompare( u , k ) == d.woCompare( u , k ) );
+ assert( ll.woCompare( u , k ) == i.woCompare( u , k ) );
+
+ assert( u.woCompare( ll ) == u.woCompare( d ) );
+ assert( u.woCompare( ll ) == u.woCompare( i ) );
+ assert( u.woCompare( ll , k ) == u.woCompare( d , k ) );
+ assert( u.woCompare( ll , k ) == u.woCompare( d , k ) );
+
+ assert( i.woCompare( n ) == d.woCompare( n ) );
+
+ assert( ll.woCompare( n ) == d.woCompare( n ) );
+ assert( ll.woCompare( n ) == i.woCompare( n ) );
+ assert( ll.woCompare( n , k ) == d.woCompare( n , k ) );
+ assert( ll.woCompare( n , k ) == i.woCompare( n , k ) );
+
+ assert( n.woCompare( ll ) == n.woCompare( d ) );
+ assert( n.woCompare( ll ) == n.woCompare( i ) );
+ assert( n.woCompare( ll , k ) == n.woCompare( d , k ) );
+ assert( n.woCompare( ll , k ) == n.woCompare( d , k ) );
+ }
+
+ {
+ BSONObj l,r;
+ { BSONObjBuilder b; b.append( "x" , "eliot" ); l = b.obj(); }
+ { BSONObjBuilder b; b.appendSymbol( "x" , "eliot" ); r = b.obj(); }
+ assert( l.woCompare( r ) == 0 );
+ assert( r.woCompare( l ) == 0 );
+ }
+ }
+
+ void run() {
+ testRegex();
+ BSONObjBuilder A,B,C;
+ A.append("x", 2);
+ B.append("x", 2.0);
+ C.append("x", 2.1);
+ BSONObj a = A.done();
+ BSONObj b = B.done();
+ BSONObj c = C.done();
+ assert( !a.binaryEqual( b ) ); // comments on operator==
+ int cmp = a.woCompare(b);
+ assert( cmp == 0 );
+ cmp = a.woCompare(c);
+ assert( cmp < 0 );
+ testoid();
+ testbounds();
+ testorder();
+ }
+ } bson_unittest;
+
+ Labeler::Label GT( "$gt" );
+ Labeler::Label GTE( "$gte" );
+ Labeler::Label LT( "$lt" );
+ Labeler::Label LTE( "$lte" );
+ Labeler::Label NE( "$ne" );
+ Labeler::Label SIZE( "$size" );
+
+ void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ) {
+ switch ( t ) {
+
+ // Shared canonical types
+ case NumberInt:
+ case NumberDouble:
+ case NumberLong:
+ append( fieldName , - numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ append( fieldName , "" ); return;
+ case Date:
+ // min varies with V0 and V1 indexes, so we go one type lower.
+ appendBool(fieldName, true);
+ //appendDate( fieldName , numeric_limits<long long>::min() );
+ return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , 0 ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
+ case jstOID: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendOID( fieldName , &o);
+ return;
+ }
+ case Bool:
+ appendBool( fieldName , false); return;
+ case jstNULL:
+ appendNull( fieldName ); return;
+ case Object:
+ append( fieldName , BSONObj() ); return;
+ case Array:
+ appendArray( fieldName , BSONObj() ); return;
+ case BinData:
+ appendBinData( fieldName , 0 , BinDataGeneral , (const char *) 0 ); return;
+ case RegEx:
+ appendRegex( fieldName , "" ); return;
+ case DBRef: {
+ OID o;
+ memset(&o, 0, sizeof(o));
+ appendDBRef( fieldName , "" , o );
+ return;
+ }
+ case Code:
+ appendCode( fieldName , "" ); return;
+ case CodeWScope:
+ appendCodeWScope( fieldName , "" , BSONObj() ); return;
+ };
+ log() << "type not supported for appendMinElementForType: " << t << endl;
+ uassert( 10061 , "type not supported for appendMinElementForType" , false );
+ }
+
+ void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ) {
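+        // for several types the max is expressed as the min of the next type in
+        // the canonical sort order (e.g. max of String == min of Object).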
+ switch ( t ) {
+
+ // Shared canonical types
+ case NumberInt:
+ case NumberDouble:
+ case NumberLong:
+ append( fieldName , numeric_limits<double>::max() ); return;
+ case Symbol:
+ case String:
+ appendMinForType( fieldName, Object ); return;
+ case Date:
+ appendDate( fieldName , numeric_limits<long long>::max() ); return;
+ case Timestamp: // TODO integrate with Date SERVER-3304
+ appendTimestamp( fieldName , numeric_limits<unsigned long long>::max() ); return;
+ case Undefined: // shared with EOO
+ appendUndefined( fieldName ); return;
+
+ // Separate canonical types
+ case MinKey:
+ appendMinKey( fieldName ); return;
+ case MaxKey:
+ appendMaxKey( fieldName ); return;
+ case jstOID: {
+ OID o;
+ memset(&o, 0xFF, sizeof(o));
+ appendOID( fieldName , &o);
+ return;
+ }
+ case Bool:
+ appendBool( fieldName , true ); return;
+ case jstNULL:
+ appendNull( fieldName ); return;
+ case Object:
+ appendMinForType( fieldName, Array ); return;
+ case Array:
+ appendMinForType( fieldName, BinData ); return;
+ case BinData:
+ appendMinForType( fieldName, jstOID ); return;
+ case RegEx:
+ appendMinForType( fieldName, DBRef ); return;
+ case DBRef:
+ appendMinForType( fieldName, Code ); return;
+ case Code:
+ appendMinForType( fieldName, CodeWScope ); return;
+ case CodeWScope:
+ // This upper bound may change if a new bson type is added.
+ appendMinForType( fieldName , MaxKey ); return;
+ }
+ log() << "type not supported for appendMaxElementForType: " << t << endl;
+ uassert( 14853 , "type not supported for appendMaxElementForType" , false );
+ }
+
+ int BSONElementFieldSorter( const void * a , const void * b ) {
+ const char * x = *((const char**)a);
+ const char * y = *((const char**)b);
+ x++; y++;
+ return lexNumCmp( x , y );
+ }
+
+ bool fieldsMatch(const BSONObj& lhs, const BSONObj& rhs) {
+ BSONObjIterator l(lhs);
+ BSONObjIterator r(rhs);
+
+ while (l.more() && r.more()){
+ if (strcmp(l.next().fieldName(), r.next().fieldName())) {
+ return false;
+ }
+ }
+
+ return !(l.more() || r.more()); // false if lhs and rhs have diff nFields()
+ }
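+
+ // e.g. fieldsMatch( BSON( "a" << 1 << "b" << 1 ), BSON( "a" << 2 << "b" << "x" ) )
+ // returns true: only field names and their order are compared, never values.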
+
+ BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) {
+ _nfields = o.nFields();
+ _fields = new const char*[_nfields];
+ int x = 0;
+ BSONObjIterator i( o );
+ while ( i.more() ) {
+ _fields[x++] = i.next().rawdata();
+ assert( _fields[x-1] );
+ }
+ assert( x == _nfields );
+ qsort( _fields , _nfields , sizeof(char*) , BSONElementFieldSorter );
+ _cur = 0;
+ }
+
+ bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ) {
+ if ( data.size() == 0 || data == "-" || data == ".")
+ return false;
+
+ unsigned int pos=0;
+ if ( data[0] == '-' )
+ pos++;
+
+ bool hasDec = false;
+
+ for ( ; pos<data.size(); pos++ ) {
+ if ( isdigit(data[pos]) )
+ continue;
+
+ if ( data[pos] == '.' ) {
+ if ( hasDec )
+ return false;
+ hasDec = true;
+ continue;
+ }
+
+ return false;
+ }
+
+ if ( hasDec ) {
+ double d = atof( data.c_str() );
+ append( fieldName , d );
+ return true;
+ }
+
+ if ( data.size() < 8 ) {
+ append( fieldName , atoi( data.c_str() ) );
+ return true;
+ }
+
+ try {
+ long long num = boost::lexical_cast<long long>( data );
+ append( fieldName , num );
+ return true;
+ }
+ catch(bad_lexical_cast &) {
+ return false;
+ }
+ }
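+
+ // e.g. appendAsNumber( "n", "12.5" ) appends a double, "123" an int (fewer than
+ // 8 chars), "12345678901" a long long; "12a" and "1.2.3" return false.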
+
+} // namespace mongo
diff --git a/src/mongo/db/jsobj.h b/src/mongo/db/jsobj.h
new file mode 100644
index 00000000000..ae039529fbf
--- /dev/null
+++ b/src/mongo/db/jsobj.h
@@ -0,0 +1,47 @@
+/** @file jsobj.h
+ BSON classes
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ BSONObj and its helpers
+
+ "BSON" stands for "binary JSON" -- ie a binary way to represent objects that would be
+ represented in JSON (plus a few extensions useful for databases & other languages).
+
+ http://www.bsonspec.org/
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../bson/util/builder.h"
+#include "../util/optime.h"
+//#include "boost/utility.hpp"
+//#include <set>
+#include "../bson/bsontypes.h"
+#include "../bson/oid.h"
+#include "../bson/bsonelement.h"
+#include "../bson/bsonobj.h"
+#include "../bson/bsonmisc.h"
+#include "../bson/bsonobjbuilder.h"
+#include "../bson/bsonobjiterator.h"
+#include "../bson/bson-inl.h"
+#include "../bson/ordering.h"
+#include "../bson/stringdata.h"
+#include "../bson/bson_db.h"
+
diff --git a/src/mongo/db/jsobjmanipulator.h b/src/mongo/db/jsobjmanipulator.h
new file mode 100644
index 00000000000..860e575940e
--- /dev/null
+++ b/src/mongo/db/jsobjmanipulator.h
@@ -0,0 +1,94 @@
+/** jsobjManipulator.h */
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+//#include "dur.h"
+
+namespace mongo {
+
+ /** Manipulate the binary representation of a BSONElement in-place.
+ Careful, this casts away const.
+ */
+ class BSONElementManipulator {
+ public:
+ BSONElementManipulator( const BSONElement &element ) :
+ _element( element ) {
+ assert( !_element.eoo() );
+ }
+ /** Replace a Timestamp type with a Date type initialized to
+ OpTime::now().asDate()
+ */
+ void initTimestamp();
+
+ // Note: the methods whose names start with a capital letter go through
+ // getDur().writing() and are journaled
+
+ /** Change the value, in place, of the number. */
+ void setNumber(double d) {
+ if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d;
+ else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d;
+ else assert(0);
+ }
+ void SetNumber(double d);
+ void setLong(long long n) {
+ assert( _element.type() == NumberLong );
+ *reinterpret_cast< long long * >( value() ) = n;
+ }
+ void SetLong(long long n);
+ void setInt(int n) {
+ assert( _element.type() == NumberInt );
+ *reinterpret_cast< int * >( value() ) = n;
+ }
+ void SetInt(int n);
+
+ /** Replace the type and value of the element with the type and value of e,
+ preserving the original fieldName */
+ void replaceTypeAndValue( const BSONElement &e ) {
+ *data() = e.type();
+ memcpy( value(), e.value(), e.valuesize() );
+ }
+
+ /* dur:: version */
+ void ReplaceTypeAndValue( const BSONElement &e );
+
+ static void lookForTimestamps( const BSONObj& obj ) {
+ // If have a Timestamp field as the first or second element,
+ // update it to a Date field set to OpTime::now().asDate(). The
+ // replacement policy is a work in progress.
+
+ BSONObjIterator i( obj );
+ for( int j = 0; i.moreWithEOO() && j < 2; ++j ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ if ( e.type() == Timestamp ) {
+ BSONElementManipulator( e ).initTimestamp();
+ break;
+ }
+ }
+ }
+ private:
+ char *data() { return nonConst( _element.rawdata() ); }
+ char *value() { return nonConst( _element.value() ); }
+ static char *nonConst( const char *s ) { return const_cast< char * >( s ); }
+
+ const BSONElement _element;
+ };
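+
+ // Usage sketch (illustrative; assumes elt is a NumberInt element):
+ // BSONElementManipulator( elt ).setInt( 5 ); // in place, not journaled
+ // BSONElementManipulator( elt ).SetInt( 5 ); // journaled via getDur().writing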
+
+} // namespace mongo
diff --git a/src/mongo/db/json.cpp b/src/mongo/db/json.cpp
new file mode 100644
index 00000000000..73457a2bfbb
--- /dev/null
+++ b/src/mongo/db/json.cpp
@@ -0,0 +1,651 @@
+// json.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#define BOOST_SPIRIT_THREADSAFE
+#if BOOST_VERSION >= 103800
+#define BOOST_SPIRIT_USE_OLD_NAMESPACE
+#include <boost/spirit/include/classic_core.hpp>
+#include <boost/spirit/include/classic_loops.hpp>
+#include <boost/spirit/include/classic_lists.hpp>
+#else
+#include <boost/spirit/core.hpp>
+#include <boost/spirit/utility/loops.hpp>
+#include <boost/spirit/utility/lists.hpp>
+#endif
+#undef assert
+#define assert MONGO_assert
+
+#include "json.h"
+#include "../bson/util/builder.h"
+#include "../util/base64.h"
+#include "../util/hex.h"
+
+
+using namespace boost::spirit;
+
+namespace mongo {
+
+ struct ObjectBuilder : boost::noncopyable {
+ ~ObjectBuilder() {
+ unsigned i = builders.size();
+ if ( i ) {
+ i--;
+ for ( ; i>=1; i-- ) {
+ if ( builders[i] ) {
+ builders[i]->done();
+ }
+ }
+ }
+ }
+ BSONObjBuilder *back() {
+ return builders.back().get();
+ }
+ // Storage for field names of elements within builders.back().
+ const char *fieldName() {
+ return fieldNames.back().c_str();
+ }
+ bool empty() const {
+ return builders.size() == 0;
+ }
+ void init() {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ void pushObject( const char *fieldName ) {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subobjStart( fieldName ) ) );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ void pushArray( const char *fieldName ) {
+ boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subarrayStart( fieldName ) ) );
+ builders.push_back( b );
+ fieldNames.push_back( "" );
+ indexes.push_back( 0 );
+ }
+ BSONObj pop() {
+ BSONObj ret;
+ if ( back()->owned() )
+ ret = back()->obj();
+ else
+ ret = back()->done();
+ builders.pop_back();
+ fieldNames.pop_back();
+ indexes.pop_back();
+ return ret;
+ }
+ void nameFromIndex() {
+ fieldNames.back() = BSONObjBuilder::numStr( indexes.back() );
+ }
+ string popString() {
+ string ret = ss.str();
+ ss.str( "" );
+ return ret;
+ }
+ // Cannot use auto_ptr because its copy constructor takes a non const reference.
+ vector< boost::shared_ptr< BSONObjBuilder > > builders;
+ vector< string > fieldNames;
+ vector< int > indexes;
+ stringstream ss;
+ string ns;
+ OID oid;
+ string binData;
+ BinDataType binDataType;
+ string regex;
+ string regexOptions;
+ Date_t date;
+ OpTime timestamp;
+ };
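+
+ // e.g. parsing { "a" : { "b" : 1 } } drives: init() at the outer '{',
+ // pushObject( "a" ) at the inner '{', a pop() when the inner object closes,
+ // and a final pop() in fromjson() for the outer object.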
+
+ struct objectStart {
+ objectStart( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ if ( b.empty() )
+ b.init();
+ else
+ b.pushObject( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayStart {
+ arrayStart( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ b.pushArray( b.fieldName() );
+ b.nameFromIndex();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayNext {
+ arrayNext( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char &c ) const {
+ ++b.indexes.back();
+ b.nameFromIndex();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct ch {
+ ch( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ b.ss << c;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct chE {
+ chE( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ char o = '\0';
+ switch ( c ) {
+ case '\"':
+ o = '\"';
+ break;
+ case '\'':
+ o = '\'';
+ break;
+ case '\\':
+ o = '\\';
+ break;
+ case '/':
+ o = '/';
+ break;
+ case 'b':
+ o = '\b';
+ break;
+ case 'f':
+ o = '\f';
+ break;
+ case 'n':
+ o = '\n';
+ break;
+ case 'r':
+ o = '\r';
+ break;
+ case 't':
+ o = '\t';
+ break;
+ case 'v':
+ o = '\v';
+ break;
+ default:
+ assert( false );
+ }
+ b.ss << o;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct chU {
+ chU( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ unsigned char first = fromHex( start );
+ unsigned char second = fromHex( start + 2 );
+ if ( first == 0 && second < 0x80 )
+ b.ss << second;
+ else if ( first < 0x08 ) {
+ b.ss << char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) );
+ b.ss << char( 0x80 | ( ~0xc0 & second ) );
+ }
+ else {
+ b.ss << char( 0xe0 | ( first >> 4 ) );
+ b.ss << char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) );
+ b.ss << char( 0x80 | ( ~0xc0 & second ) );
+ }
+ }
+ ObjectBuilder &b;
+ };
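+
+ // e.g. "\u00e9" parses as first == 0x00, second == 0xe9 and is emitted as the
+ // two-byte UTF-8 sequence 0xc3 0xa9.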
+
+ struct chClear {
+ chClear( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char c ) const {
+ b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct fieldNameEnd {
+ fieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string name = b.popString();
+ massert( 10338 , "Invalid use of reserved field name: " + name,
+ name != "$oid" &&
+ name != "$binary" &&
+ name != "$type" &&
+ name != "$date" &&
+ name != "$timestamp" &&
+ name != "$regex" &&
+ name != "$options" );
+ b.fieldNames.back() = name;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct unquotedFieldNameEnd {
+ unquotedFieldNameEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string name( start, end );
+ b.fieldNames.back() = name;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct stringEnd {
+ stringEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->append( b.fieldName(), b.popString() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct numberValue {
+ numberValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ string raw(start);
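+ // note raw runs from the token start to the end of the whole input; the
+ // prefix compares below only look at the token's leading characters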
+ double val;
+
+ // strtod isn't able to deal with NaN and inf in a portable way.
+ // Correspondingly, we perform the conversions explicitly.
+
+ if ( ! raw.compare(0, 3, "NaN" ) ) {
+ val = std::numeric_limits<double>::quiet_NaN();
+ }
+ else if ( ! raw.compare(0, 8, "Infinity" ) ) {
+ val = std::numeric_limits<double>::infinity();
+ }
+ else if ( ! raw.compare(0, 9, "-Infinity" ) ) {
+ val = -std::numeric_limits<double>::infinity();
+ }
+ else {
+ // We re-parse the numeric string here because spirit parsing of strings
+ // to doubles produces different results from strtod in some cases and
+ // we want to use strtod to ensure consistency with other string to
+ // double conversions in our code.
+
+ val = strtod( start, 0 );
+ }
+
+ b.back()->append( b.fieldName(), val );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct intValue {
+ intValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( long long num ) const {
+ if (num >= numeric_limits<int>::min() && num <= numeric_limits<int>::max())
+ b.back()->append( b.fieldName(), (int)num );
+ else
+ b.back()->append( b.fieldName(), num );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct subobjectEnd {
+ subobjectEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.pop();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct arrayEnd {
+ arrayEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.pop();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct trueValue {
+ trueValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBool( b.fieldName(), true );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct falseValue {
+ falseValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBool( b.fieldName(), false );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct nullValue {
+ nullValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendNull( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct undefinedValue {
+ undefinedValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendUndefined( b.fieldName() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dbrefNS {
+ dbrefNS( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.ns = b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+// NOTE s must be 24 hex characters (12 bytes).
+ OID stringToOid( const char *s ) {
+ OID oid;
+ char *oidP = (char *)( &oid );
+ for ( int i = 0; i < 12; ++i )
+ oidP[ i ] = fromHex( s + ( i * 2 ) );
+ return oid;
+ }
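+
+ // e.g. stringToOid( "000102030405060708090a0b" ) yields an OID whose twelve
+ // bytes are 0x00 through 0x0b.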
+
+ struct oidValue {
+ oidValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.oid = stringToOid( start );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dbrefEnd {
+ dbrefEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendDBRef( b.fieldName(), b.ns, b.oid );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct oidEnd {
+ oidEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendOID( b.fieldName(), &b.oid );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampEnd {
+ timestampEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendTimestamp( b.fieldName(), b.timestamp.asDate() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataBinary {
+ binDataBinary( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ massert( 10339 , "Badly formatted bindata", ( end - start ) % 4 == 0 );
+ string encoded( start, end );
+ b.binData = base64::decode( encoded );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataType {
+ binDataType( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.binDataType = BinDataType( fromHex( start ) );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct binDataEnd {
+ binDataEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendBinData( b.fieldName(), b.binData.length(),
+ b.binDataType, b.binData.data() );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampSecs {
+ timestampSecs( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( unsigned long long x) const {
+ b.timestamp = OpTime( (unsigned) (x/1000) , 0);
+ }
+ ObjectBuilder &b;
+ };
+
+ struct timestampInc {
+ timestampInc( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( unsigned x) const {
+ b.timestamp = OpTime(b.timestamp.getSecs(), x);
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dateValue {
+ dateValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( Date_t v ) const {
+ b.date = v;
+ }
+ ObjectBuilder &b;
+ };
+
+ struct dateEnd {
+ dateEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendDate( b.fieldName(), b.date );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexValue {
+ regexValue( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.regex = b.popString();
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexOptions {
+ regexOptions( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.regexOptions = string( start, end );
+ }
+ ObjectBuilder &b;
+ };
+
+ struct regexEnd {
+ regexEnd( ObjectBuilder &_b ) : b( _b ) {}
+ void operator() ( const char *start, const char *end ) const {
+ b.back()->appendRegex( b.fieldName(), b.regex, b.regexOptions );
+ }
+ ObjectBuilder &b;
+ };
+
+// One gotcha with this parsing library is probably best illustrated with an
+// example. Say we have a production like this:
+// z = ( ch_p( 'a' )[ foo ] >> ch_p( 'b' ) ) | ( ch_p( 'a' )[ foo ] >> ch_p( 'c' ) );
+// On input "ac", action foo() will be called twice -- once as the parser tries
+// to match "ab", again as the parser successfully matches "ac". Sometimes
+// the grammar can be modified to eliminate these situations. Here, for example:
+// z = ch_p( 'a' )[ foo ] >> ( ch_p( 'b' ) | ch_p( 'c' ) );
+// However, this is not always possible. In my implementation I've tried to
+// stick to the following pattern: store fields fed to action callbacks
+// temporarily as ObjectBuilder members, then append to a BSONObjBuilder once
+// the parser has completely matched a nonterminal and won't backtrack. It's
+// worth noting here that this parser follows a short-circuit convention. So,
+// in the original z example on line 3, if the input was "ab", foo() would only
+// be called once.
+ struct JsonGrammar : public grammar< JsonGrammar > {
+ public:
+ JsonGrammar( ObjectBuilder &_b ) : b( _b ) {}
+
+ template < typename ScannerT >
+ struct definition {
+ definition( JsonGrammar const &self ) {
+ object = ch_p( '{' )[ objectStart( self.b ) ] >> !members >> '}';
+ members = list_p((fieldName >> ':' >> value) , ',');
+ fieldName =
+ str[ fieldNameEnd( self.b ) ] |
+ singleQuoteStr[ fieldNameEnd( self.b ) ] |
+ unquotedFieldName[ unquotedFieldNameEnd( self.b ) ];
+ array = ch_p( '[' )[ arrayStart( self.b ) ] >> !elements >> ']';
+ elements = list_p(value, ch_p(',')[arrayNext( self.b )]);
+ value =
+ str[ stringEnd( self.b ) ] |
+ number[ numberValue( self.b ) ] |
+ integer |
+ array[ arrayEnd( self.b ) ] |
+ lexeme_d[ str_p( "true" ) ][ trueValue( self.b ) ] |
+ lexeme_d[ str_p( "false" ) ][ falseValue( self.b ) ] |
+ lexeme_d[ str_p( "null" ) ][ nullValue( self.b ) ] |
+ lexeme_d[ str_p( "undefined" ) ][ undefinedValue( self.b ) ] |
+ singleQuoteStr[ stringEnd( self.b ) ] |
+ date[ dateEnd( self.b ) ] |
+ oid[ oidEnd( self.b ) ] |
+ bindata[ binDataEnd( self.b ) ] |
+ dbref[ dbrefEnd( self.b ) ] |
+ timestamp[ timestampEnd( self.b ) ] |
+ regex[ regexEnd( self.b ) ] |
+ object[ subobjectEnd( self.b ) ] ;
+ // NOTE lexeme_d and rules don't mix well, so we have this mess.
+ // NOTE We use range_p rather than cntrl_p, because the latter is locale dependent.
+ str = lexeme_d[ ch_p( '"' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ (
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ )
+ ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '"' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '"' ];
+
+ singleQuoteStr = lexeme_d[ ch_p( '\'' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ (
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ch_p( 'v' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) |
+ ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported
+ )
+ ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ];
+
+ // real_p accepts numbers with nonsignificant zero prefixes, which
+ // aren't allowed in JSON. Oh well.
+ number = strict_real_p | str_p( "NaN" ) | str_p( "Infinity" ) | str_p( "-Infinity" );
+
+ static int_parser<long long, 10, 1, numeric_limits<long long>::digits10 + 1> long_long_p;
+ integer = long_long_p[ intValue(self.b) ];
+
+ // We allow a subset of valid js identifier names here.
+ unquotedFieldName = lexeme_d[ ( alpha_p | ch_p( '$' ) | ch_p( '_' ) ) >> *( ( alnum_p | ch_p( '$' ) | ch_p( '_' )) ) ];
+
+ dbref = dbrefS | dbrefT;
+ dbrefS = ch_p( '{' ) >> "\"$ref\"" >> ':' >>
+ str[ dbrefNS( self.b ) ] >> ',' >> "\"$id\"" >> ':' >> quotedOid >> '}';
+ dbrefT = str_p( "Dbref" ) >> '(' >> str[ dbrefNS( self.b ) ] >> ',' >>
+ quotedOid >> ')';
+
+ timestamp = ch_p( '{' ) >> "\"$timestamp\"" >> ':' >> '{' >>
+ "\"t\"" >> ':' >> uint_parser<unsigned long long, 10, 1, -1>()[ timestampSecs(self.b) ] >> ',' >>
+ "\"i\"" >> ':' >> uint_parser<unsigned int, 10, 1, -1>()[ timestampInc(self.b) ] >> '}' >>'}';
+
+ oid = oidS | oidT;
+ oidS = ch_p( '{' ) >> "\"$oid\"" >> ':' >> quotedOid >> '}';
+ oidT = str_p( "ObjectId" ) >> '(' >> quotedOid >> ')';
+
+ quotedOid = lexeme_d[ '"' >> ( repeat_p( 24 )[ xdigit_p ] )[ oidValue( self.b ) ] >> '"' ];
+
+ bindata = ch_p( '{' ) >> "\"$binary\"" >> ':' >>
+ lexeme_d[ '"' >> ( *( range_p( 'A', 'Z' ) | range_p( 'a', 'z' ) | range_p( '0', '9' ) | ch_p( '+' ) | ch_p( '/' ) ) >> *ch_p( '=' ) )[ binDataBinary( self.b ) ] >> '"' ] >> ',' >> "\"$type\"" >> ':' >>
+ lexeme_d[ '"' >> ( repeat_p( 2 )[ xdigit_p ] )[ binDataType( self.b ) ] >> '"' ] >> '}';
+
+ // TODO: this will need to use a signed parser at some point
+ date = dateS | dateT;
+ dateS = ch_p( '{' ) >> "\"$date\"" >> ':' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> '}';
+ dateT = !str_p("new") >> str_p( "Date" ) >> '(' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> ')';
+
+ regex = regexS | regexT;
+ regexS = ch_p( '{' ) >> "\"$regex\"" >> ':' >> str[ regexValue( self.b ) ] >> ',' >> "\"$options\"" >> ':' >> lexeme_d[ '"' >> ( *( alpha_p ) )[ regexOptions( self.b ) ] >> '"' ] >> '}';
+ // FIXME Obviously it would be nice to unify this with str.
+ regexT = lexeme_d[ ch_p( '/' )[ chClear( self.b ) ] >>
+ *( ( ch_p( '\\' ) >>
+ ( ch_p( '"' )[ chE( self.b ) ] |
+ ch_p( '\\' )[ chE( self.b ) ] |
+ ch_p( '/' )[ chE( self.b ) ] |
+ ch_p( 'b' )[ chE( self.b ) ] |
+ ch_p( 'f' )[ chE( self.b ) ] |
+ ch_p( 'n' )[ chE( self.b ) ] |
+ ch_p( 'r' )[ chE( self.b ) ] |
+ ch_p( 't' )[ chE( self.b ) ] |
+ ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) ) ) |
+ ( ~range_p( 0x00, 0x1f ) & ~ch_p( '/' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> str_p( "/" )[ regexValue( self.b ) ]
+ >> ( *( ch_p( 'i' ) | ch_p( 'g' ) | ch_p( 'm' ) ) )[ regexOptions( self.b ) ] ];
+ }
+ rule< ScannerT > object, members, array, elements, value, str, number, integer,
+ dbref, dbrefS, dbrefT, timestamp, timestampS, timestampT, oid, oidS, oidT,
+ bindata, date, dateS, dateT, regex, regexS, regexT, quotedOid, fieldName,
+ unquotedFieldName, singleQuoteStr;
+ const rule< ScannerT > &start() const {
+ return object;
+ }
+ };
+ ObjectBuilder &b;
+ };
+
+ BSONObj fromjson( const char *str , int* len) {
+ if ( str[0] == '\0' ) {
+ if (len) *len = 0;
+ return BSONObj();
+ }
+
+ ObjectBuilder b;
+ JsonGrammar parser( b );
+ parse_info<> result = parse( str, parser, space_p );
+ if (len) {
+ *len = result.stop - str;
+ }
+ else if ( !result.full ) {
+ int limit = strnlen(result.stop , 10);
+ if (limit == -1) limit = 10;
+ msgasserted(10340, "Failure parsing JSON string near: " + string( result.stop, limit ));
+ }
+ BSONObj ret = b.pop();
+ assert( b.empty() );
+ return ret;
+ }
+
+ BSONObj fromjson( const string &str ) {
+ return fromjson( str.c_str() );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/json.h b/src/mongo/db/json.h
new file mode 100644
index 00000000000..68dae042574
--- /dev/null
+++ b/src/mongo/db/json.h
@@ -0,0 +1,41 @@
+/** @file json.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ /** Create a BSONObj from a JSON <http://www.json.org> string. In addition
+ to the JSON extensions described here
+ <http://mongodb.onconfluence.com/display/DOCS/Mongo+Extended+JSON>,
+ this function accepts certain unquoted field names and allows single quotes
+ to optionally be used when specifying field names and string values instead
+ of double quotes. JSON unicode escape sequences (of the form \uXXXX) are
+ converted to utf8.
+ \throws MsgAssertionException if parsing fails. The message included with
+ this assertion includes a rough indication of where parsing failed.
+ */
+ BSONObj fromjson(const string &str);
+
+ /** len will be size of JSON object in text chars. */
+ BSONObj fromjson(const char *str, int* len=NULL);
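+
+ /** Example (illustrative):
+ BSONObj o = fromjson( "{ a : 1, b : 'hello' }" );
+ // o["a"].numberInt() == 1 -- unquoted field names and single quotes accepted
+ */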
+
+} // namespace mongo
diff --git a/src/mongo/db/key.cpp b/src/mongo/db/key.cpp
new file mode 100644
index 00000000000..47449986d21
--- /dev/null
+++ b/src/mongo/db/key.cpp
@@ -0,0 +1,678 @@
+// @file key.cpp
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "key.h"
+#include "../util/unittest.h"
+
+namespace mongo {
+
+ extern const Ordering nullOrdering = Ordering::make(BSONObj());
+
+ // KeyBson is for V0 (version #0) indexes
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);
+
+ // "old" = pre signed dates & such; i.e. btree V0
+ /* must be same canon type when called */
+ int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
+ dassert( l.canonicalType() == r.canonicalType() );
+ int f;
+ double x;
+
+ switch ( l.type() ) {
+ case EOO:
+ case Undefined: // EOO and Undefined are same canonicalType
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ return 0;
+ case Bool:
+ return *l.value() - *r.value();
+ case Timestamp:
+ case Date:
+ // unsigned dates for old version
+ if ( l.date() < r.date() )
+ return -1;
+ return l.date() == r.date() ? 0 : 1;
+ case NumberLong:
+ if( r.type() == NumberLong ) {
+ long long L = l._numberLong();
+ long long R = r._numberLong();
+ if( L < R ) return -1;
+ if( L == R ) return 0;
+ return 1;
+ }
+ // else fall through
+ case NumberInt:
+ case NumberDouble: {
+ double left = l.number();
+ double right = r.number();
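+ // the bounds checks below catch NaN (and also the infinities, which fail
+ // them too); such values compare equal to each other and sort below all others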
+ bool lNan = !( left <= numeric_limits< double >::max() &&
+ left >= -numeric_limits< double >::max() );
+ bool rNan = !( right <= numeric_limits< double >::max() &&
+ right >= -numeric_limits< double >::max() );
+ if ( lNan ) {
+ if ( rNan ) {
+ return 0;
+ }
+ else {
+ return -1;
+ }
+ }
+ else if ( rNan ) {
+ return 1;
+ }
+ x = left - right;
+ if ( x < 0 ) return -1;
+ return x == 0 ? 0 : 1;
+ }
+ case jstOID:
+ return memcmp(l.value(), r.value(), 12);
+ case Code:
+ case Symbol:
+ case String:
+ // nulls not allowed in the middle of strings in the old version
+ return strcmp(l.valuestr(), r.valuestr());
+ case Object:
+ case Array:
+ return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering);
+ case DBRef: {
+ int lsz = l.valuesize();
+ int rsz = r.valuesize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value(), r.value(), lsz);
+ }
+ case BinData: {
+ int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
+ int rsz = r.objsize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value()+4, r.value()+4, lsz+1);
+ }
+ case RegEx: {
+ int c = strcmp(l.regex(), r.regex());
+ if ( c )
+ return c;
+ return strcmp(l.regexFlags(), r.regexFlags());
+ }
+ case CodeWScope : {
+ f = l.canonicalType() - r.canonicalType();
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() );
+ if ( f )
+ return f;
+ return 0;
+ }
+ default:
+ out() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
+ assert(false);
+ }
+ return -1;
+ }
+
+ int oldElemCompare(const BSONElement&l , const BSONElement& r) {
+ int lt = (int) l.canonicalType();
+ int rt = (int) r.canonicalType();
+ int x = lt - rt;
+ if( x )
+ return x;
+ return oldCompareElementValues(l, r);
+ }
+
+ // pre signed dates & such
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
+ BSONObjIterator i(l);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = oldElemCompare(l, r);
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* old style compares:
+ - dates are unsigned
+ - strings no nulls
+ */
+ int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
+ return oldCompare(_o, r._o, o);
+ }
+
+ // woEqual could be made faster than woCompare, but this is only for backward
+ // compatibility, so it is not worth a big effort
+ bool KeyBson::woEqual(const KeyBson& r) const {
+ return oldCompare(_o, r._o, nullOrdering) == 0;
+ }
+
+ // [ ][HASMORE][x][y][canontype_4bits]
+ enum CanonicalsEtc {
+ cminkey=1,
+ cnull=2,
+ cdouble=4,
+ cstring=6,
+ cbindata=7,
+ coid=8,
+ cfalse=10,
+ ctrue=11,
+ cdate=12,
+ cmaxkey=14,
+ cCANONTYPEMASK = 0xf,
+ cY = 0x10,
+ cint = cY | cdouble,
+ cX = 0x20,
+ clong = cX | cdouble,
+ cHASMORE = 0x40,
+ cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
+ };
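+
+ // e.g. a key byte of ( cint | cHASMORE ) == 0x54 decodes as canonical type
+ // cdouble (low nibble 0x4), the cY bit marking a value stored from a NumberInt,
+ // and cHASMORE indicating that further key fields follow.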
+
+ // bindata bson type
+ const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
+ const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
+ const int BinDataLenMax = 32;
+ const int BinDataLengthToCode[] = {
+ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
+ 0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
+ 0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
+ 0xf0/*32*/
+ };
+ const int BinDataCodeToLength[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
+ };
+
+ int binDataCodeToLength(int codeByte) {
+ return BinDataCodeToLength[codeByte >> 4];
+ }
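+
+ // e.g. a bindata code byte of 0xa3 has high nibble 0xa, so binDataCodeToLength(0xa3)
+ // == BinDataCodeToLength[10] == 12 bytes; the low nibble 0x3 carries the subtype.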
+
+ /** The object cannot be represented in compact format, so store it in traditional
+ bson format with a leading sentinel byte, IsBSON, to indicate that format.
+
+ Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
+ so that we don't have to do an extra malloc.
+ */
+ void KeyV1Owned::traditional(const BSONObj& obj) {
+ b.reset();
+ b.appendUChar(IsBSON);
+ b.appendBuf(obj.objdata(), obj.objsize());
+ _keyData = (const unsigned char *) b.buf();
+ }
+
+ KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
+ b.appendBuf( rhs.data(), rhs.dataSize() );
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ // fromBSON to Key format
+ KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
+ BSONObj::iterator i(obj);
+ unsigned char bits = 0;
+ while( 1 ) {
+ BSONElement e = i.next();
+ if( i.more() )
+ bits |= cHASMORE;
+ switch( e.type() ) {
+ case MinKey:
+ b.appendUChar(cminkey|bits);
+ break;
+ case jstNULL:
+ b.appendUChar(cnull|bits);
+ break;
+ case MaxKey:
+ b.appendUChar(cmaxkey|bits);
+ break;
+ case Bool:
+ b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
+ break;
+ case jstOID:
+ b.appendUChar(coid|bits);
+ b.appendBuf(&e.__oid(), sizeof(OID));
+ break;
+ case BinData:
+ {
+ int t = e.binDataType();
+ // 0-7 and 0x80 to 0x87 are supported by Key
+ if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
+ int len;
+ const char * d = e.binData(len);
+ if( len <= BinDataLenMax ) {
+ int code = BinDataLengthToCode[len];
+ if( code >= 0 ) {
+ if( t >= 128 )
+ t = (t-128) | 0x08;
+ dassert( (code&t) == 0 );
+ b.appendUChar( cbindata|bits );
+ b.appendUChar( code | t );
+ b.appendBuf(d, len);
+ break;
+ }
+ }
+ }
+ traditional(obj);
+ return;
+ }
+ case Date:
+ b.appendUChar(cdate|bits);
+ b.appendStruct(e.date());
+ break;
+ case String:
+ {
+ b.appendUChar(cstring|bits);
+ // note we do not store the terminating null, to save space.
+ unsigned x = (unsigned) e.valuestrsize() - 1;
+ if( x > 255 ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(x);
+ b.appendBuf(e.valuestr(), x);
+ break;
+ }
+ case NumberInt:
+ b.appendUChar(cint|bits);
+ b.appendNum((double) e._numberInt());
+ break;
+ case NumberLong:
+ {
+ long long n = e._numberLong();
+ long long m = 2LL << 52;
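+ // m == 2^53; doubles represent integers exactly only within that range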
+ DEV {
+ long long d = m-1;
+ assert( ((long long) ((double) -d)) == -d );
+ }
+ if( n >= m || n <= -m ) {
+ // can't represent exactly as a double
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(clong|bits);
+ b.appendNum((double) n);
+ break;
+ }
+ case NumberDouble:
+ {
+ double d = e._numberDouble();
+ if( isNaN(d) ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(cdouble|bits);
+ b.appendNum(d);
+ break;
+ }
+ default:
+ // if other types involved, store as traditional BSON
+ traditional(obj);
+ return;
+ }
+ if( !i.more() )
+ break;
+ bits = 0;
+ }
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ BSONObj KeyV1::toBson() const {
+ assert( _keyData != 0 );
+ if( !isCompactFormat() )
+ return bson();
+
+ BSONObjBuilder b(512);
+ const unsigned char *p = _keyData;
+ while( 1 ) {
+ unsigned bits = *p++;
+
+ switch( bits & 0x3f ) {
+ case cminkey: b.appendMinKey(""); break;
+ case cnull: b.appendNull(""); break;
+ case cfalse: b.appendBool("", false); break;
+ case ctrue: b.appendBool("", true); break;
+ case cmaxkey: b.appendMaxKey(""); break;
+ case cstring:
+ {
+ unsigned sz = *p++;
+ // we build the element ourself as we have to null terminate it
+ BufBuilder &bb = b.bb();
+ bb.appendNum((char) String);
+ bb.appendUChar(0); // fieldname ""
+ bb.appendNum(sz+1);
+ bb.appendBuf(p, sz);
+ bb.appendUChar(0); // null char at end of string
+ p += sz;
+ break;
+ }
+ case coid:
+ b.appendOID("", (OID *) p);
+ p += sizeof(OID);
+ break;
+ case cbindata:
+ {
+ int len = binDataCodeToLength(*p);
+ int subtype = (*p) & BinDataTypeMask;
+ if( subtype & 0x8 ) {
+ subtype = (subtype & 0x7) | 0x80;
+ }
+ b.appendBinData("", len, (BinDataType) subtype, ++p);
+ p += len;
+ break;
+ }
+ case cdate:
+ b.appendDate("", (Date_t&) *p);
+ p += 8;
+ break;
+ case cdouble:
+ b.append("", (double&) *p);
+ p += sizeof(double);
+ break;
+ case cint:
+ b.append("", (int) ((double&) *p));
+ p += sizeof(double);
+ break;
+ case clong:
+ b.append("", (long long) ((double&) *p));
+ p += sizeof(double);
+ break;
+ default:
+ assert(false);
+ }
+
+ if( (bits & cHASMORE) == 0 )
+ break;
+ }
+ return b.obj();
+ }
+
+ static int compare(const unsigned char *&l, const unsigned char *&r) {
+ int lt = (*l & cCANONTYPEMASK);
+ int rt = (*r & cCANONTYPEMASK);
+ int x = lt - rt;
+ if( x )
+ return x;
+
+ l++; r++;
+
+ // same type
+ switch( lt ) {
+ case cdouble:
+ {
+ double L = *((double *) l);
+ double R = *((double *) r);
+ if( L < R )
+ return -1;
+ if( L != R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case cstring:
+ {
+ int lsz = *l;
+ int rsz = *r;
+ int common = min(lsz, rsz);
+ l++; r++; // skip the size byte
+ // use memcmp as we (will) allow zeros in UTF8 strings
+ int res = memcmp(l, r, common);
+ if( res )
+ return res;
+ // longer string is the greater one
+ int diff = lsz-rsz;
+ if( diff )
+ return diff;
+ l += lsz; r += lsz;
+ break;
+ }
+ case cbindata:
+ {
+ int L = *l;
+ int R = *r;
+ int llen = binDataCodeToLength(L);
+ int diff = L-R; // checks length and subtype simultaneously
+ if( diff ) {
+ // unfortunately the nibbles are ordered the wrong way round to check
+ // subtype and length in one comparison (we could bit-swap them...)
+ int rlen = binDataCodeToLength(R);
+ if( llen != rlen )
+ return llen - rlen;
+ return diff;
+ }
+ // same length, same type
+ l++; r++;
+ int res = memcmp(l, r, llen);
+ if( res )
+ return res;
+ l += llen; r += llen;
+ break;
+ }
+ case cdate:
+ {
+ long long L = *((long long *) l);
+ long long R = *((long long *) r);
+ if( L < R )
+ return -1;
+ if( L > R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case coid:
+ {
+ int res = memcmp(l, r, sizeof(OID));
+ if( res )
+ return res;
+ l += 12; r += 12;
+ break;
+ }
+ default:
+ // all the others are a match -- e.g. null == null
+ ;
+ }
+
+ return 0;
+ }
+
+ // at least one of this and right are traditional BSON format
+ int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
+ BSONObj L = toBson();
+ BSONObj R = right.toBson();
+ return L.woCompare(R, order, /*considerfieldname*/false);
+ }
+
+ int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
+ if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
+ return compareHybrid(right, order);
+
+ unsigned mask = 1;
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ {
+ int x = compare(l, r); // updates l and r pointers
+ if( x ) {
+ if( order.descending(mask) )
+ x = -x;
+ return x;
+ }
+ }
+
+ {
+ int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
+ if( x )
+ return x;
+ if( (lval & cHASMORE) == 0 )
+ break;
+ }
+
+ mask <<= 1;
+ }
+
+ return 0;
+ }
+
+ static unsigned sizes[] = {
+ 0,
+ 1, //cminkey=1,
+ 1, //cnull=2,
+ 0,
+ 9, //cdouble=4,
+ 0,
+ 0, //cstring=6,
+ 0,
+ 13, //coid=8,
+ 0,
+ 1, //cfalse=10,
+ 1, //ctrue=11,
+ 9, //cdate=12,
+ 0,
+ 1, //cmaxkey=14,
+ 0
+ };
+
+ inline unsigned sizeOfElement(const unsigned char *p) {
+ unsigned type = *p & cCANONTYPEMASK;
+ unsigned sz = sizes[type];
+ if( sz == 0 ) {
+ if( type == cstring ) {
+ sz = ((unsigned) p[1]) + 2;
+ }
+ else {
+ assert( type == cbindata );
+ sz = binDataCodeToLength(p[1]) + 2;
+ }
+ }
+ return sz;
+ }
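+
+ // e.g. the compact string "ab" occupies 1 type byte + 1 length byte + 2 chars
+ // == 4 bytes, matching the cstring branch's ((unsigned) p[1]) + 2.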
+
+ int KeyV1::dataSize() const {
+ const unsigned char *p = _keyData;
+ if( !isCompactFormat() ) {
+ return bson().objsize() + 1;
+ }
+
+ bool more;
+ do {
+ unsigned z = sizeOfElement(p);
+ more = (*p & cHASMORE) != 0;
+ p += z;
+ } while( more );
+ return p - _keyData;
+ }
+
+ bool KeyV1::woEqual(const KeyV1& right) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
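+ // if either side is in traditional BSON format, compare as BSONObj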
+ if( (*l|*r) == IsBSON ) {
+ return toBson().equal(right.toBson());
+ }
+
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
+ return false;
+ l++; r++;
+ switch( lval&cCANONTYPEMASK ) {
+ case coid:
+ if( *((unsigned*) l) != *((unsigned*) r) )
+ return false;
+ l += 4; r += 4;
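+ // deliberate fall through: the cdate case below compares the
+ // remaining 8 of the OID's 12 bytes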
+ case cdate:
+ if( *((unsigned long long *) l) != *((unsigned long long *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cdouble:
+ if( *((double *) l) != *((double *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cstring:
+ {
+ if( *l != *r )
+ return false; // not same length
+ unsigned sz = ((unsigned) *l) + 1;
+ if( memcmp(l, r, sz) )
+ return false;
+ l += sz; r += sz;
+ break;
+ }
+ case cbindata:
+ {
+ if( *l != *r )
+ return false; // len or subtype mismatch
+ int len = binDataCodeToLength(*l) + 1;
+ if( memcmp(l, r, len) )
+ return false;
+ l += len; r += len;
+ break;
+ }
+ case cminkey:
+ case cnull:
+ case cfalse:
+ case ctrue:
+ case cmaxkey:
+ break;
+ default:
+ assert(false);
+ }
+ if( (lval&cHASMORE) == 0 )
+ break;
+ }
+ return true;
+ }
+
+ struct CmpUnitTest : public UnitTest {
+ void run() {
+ char a[2];
+ char b[2];
+ a[0] = -3;
+ a[1] = 0;
+ b[0] = 3;
+ b[1] = 0;
+ assert( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
+ }
+ } cunittest;
+
+}
diff --git a/src/mongo/db/key.h b/src/mongo/db/key.h
new file mode 100644
index 00000000000..9284cdc7422
--- /dev/null
+++ b/src/mongo/db/key.h
@@ -0,0 +1,115 @@
+// @file key.h class(es) representing individual keys in a btree
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+
+namespace mongo {
+
+ /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.
+
+ KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.
+
+ KeyV1 is the new implementation.
+ */
+ class KeyBson /* "KeyV0" */ {
+ public:
+ KeyBson() { }
+ explicit KeyBson(const char *keyData) : _o(keyData) { }
+ explicit KeyBson(const BSONObj& obj) : _o(obj) { }
+ int woCompare(const KeyBson& r, const Ordering &o) const;
+ BSONObj toBson() const { return _o; }
+ string toString() const { return _o.toString(); }
+ int dataSize() const { return _o.objsize(); }
+ const char * data() const { return _o.objdata(); }
+ BSONElement _firstElement() const { return _o.firstElement(); }
+ bool isCompactFormat() const { return false; }
+ bool woEqual(const KeyBson& r) const;
+ void assign(const KeyBson& rhs) { *this = rhs; }
+ private:
+ BSONObj _o;
+ };
+
+ class KeyV1Owned;
+
+ // corresponding to BtreeData_V1
+ class KeyV1 {
+ void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
+ KeyV1(const KeyV1Owned&); // disallowed: the KeyV1Owned source would likely go out of scope, leaving a dangling buffer
+ public:
+ KeyV1() { _keyData = 0; }
+ ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }
+
+ KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
+ dassert( _keyData > (const unsigned char *) 1 );
+ }
+
+ // explicit version of operator= to be safe
+ void assign(const KeyV1& rhs) {
+ _keyData = rhs._keyData;
+ }
+
+ /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
+ when BSON, we are just a wrapper
+ */
+ explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }
+
+ int woCompare(const KeyV1& r, const Ordering &o) const;
+ bool woEqual(const KeyV1& r) const;
+ BSONObj toBson() const;
+ string toString() const { return toBson().toString(); }
+
+ /** get the key data we want to store in the btree bucket */
+ const char * data() const { return (const char *) _keyData; }
+
+ /** @return size of data() */
+ int dataSize() const;
+
+ /** only used by geo, which always has bson keys */
+ BSONElement _firstElement() const { return bson().firstElement(); }
+ bool isCompactFormat() const { return *_keyData != IsBSON; }
+ protected:
+ enum { IsBSON = 0xff };
+ const unsigned char *_keyData;
+ BSONObj bson() const {
+ dassert( !isCompactFormat() );
+ return BSONObj((const char *) _keyData+1);
+ }
+ private:
+ int compareHybrid(const KeyV1& right, const Ordering& order) const;
+ };
+
+ class KeyV1Owned : public KeyV1 {
+ void operator=(const KeyV1Owned&);
+ public:
+ /** @param obj a BSON object to be translated to KeyV1 format. If the object isn't
+ representable in KeyV1 format (which happens, intentionally, at times)
+ it will stay as bson herein.
+ */
+ KeyV1Owned(const BSONObj& obj);
+
+ /** makes a copy (memcpy's the whole thing) */
+ KeyV1Owned(const KeyV1& rhs);
+
+ private:
+ StackBufBuilder b;
+ void traditional(const BSONObj& obj); // store as traditional bson not as compact format
+ };
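+
+ // Usage sketch (illustrative): build compact keys from BSON and compare them
+ // under an ordering.
+ // KeyV1Owned a( BSON( "" << 1 ) ), b( BSON( "" << 2 ) );
+ // int r = a.woCompare( b, Ordering::make( BSONObj() ) ); // r < 0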
+
+};
diff --git a/src/mongo/db/lasterror.cpp b/src/mongo/db/lasterror.cpp
new file mode 100644
index 00000000000..4ed4dfb0571
--- /dev/null
+++ b/src/mongo/db/lasterror.cpp
@@ -0,0 +1,142 @@
+// lasterror.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include "../util/unittest.h"
+#include "../util/net/message.h"
+
+
+#include "lasterror.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ LastError LastError::noError;
+ LastErrorHolder lastError;
+
+ bool isShell = false;
+ void raiseError(int code , const char *msg) {
+ LastError *le = lastError.get();
+ if ( le == 0 ) {
+ /* might be intentional (non-user thread) */
+ DEV {
+ static unsigned n;
+ if( ++n < 4 && !isShell ) log() << "dev: lastError==0 won't report:" << msg << endl;
+ }
+ }
+ else if ( le->disabled ) {
+ log() << "lastError disabled, can't report: " << code << ":" << msg << endl;
+ }
+ else {
+ le->raiseError(code, msg);
+ }
+ }
+
+ bool LastError::appendSelf( BSONObjBuilder &b , bool blankErr ) {
+ if ( !valid ) {
+ if ( blankErr )
+ b.appendNull( "err" );
+ b.append( "n", 0 );
+ return false;
+ }
+
+ if ( msg.empty() ) {
+ if ( blankErr ) {
+ b.appendNull( "err" );
+ }
+ }
+ else {
+ b.append( "err", msg );
+ }
+
+ if ( code )
+ b.append( "code" , code );
+ if ( updatedExisting != NotUpdate )
+ b.appendBool( "updatedExisting", updatedExisting == True );
+ if ( upsertedId.isSet() )
+ b.append( "upserted" , upsertedId );
+ if ( writebackId.isSet() ) {
+ b.append( "writeback" , writebackId );
+ b.append( "instanceIdent" , prettyHostName() ); // this can be any unique string
+ }
+ b.appendNumber( "n", nObjects );
+
+ return ! msg.empty();
+ }
+
+ LastErrorHolder::~LastErrorHolder() {
+ }
+
+
+ LastError * LastErrorHolder::disableForCommand() {
+ LastError *le = _get();
+ uassert(13649, "no operation yet", le);
+ le->disabled = true;
+ le->nPrev--; // caller is a command that shouldn't count as an operation
+ return le;
+ }
+
+ LastError * LastErrorHolder::get( bool create ) {
+ LastError *ret = _get( create );
+ if ( ret && !ret->disabled )
+ return ret;
+ return 0;
+ }
+
+ LastError * LastErrorHolder::_get( bool create ) {
+ LastError * le = _tl.get();
+ if ( ! le && create ) {
+ le = new LastError();
+ _tl.reset( le );
+ }
+ return le;
+ }
+
+ void LastErrorHolder::release() {
+ _tl.release();
+ }
+
+ /** ok to call more than once. */
+ void LastErrorHolder::initThread() {
+ if( ! _tl.get() )
+ _tl.reset( new LastError() );
+ }
+
+ void LastErrorHolder::reset( LastError * le ) {
+ _tl.reset( le );
+ }
+
+ void prepareErrForNewRequest( Message &m, LastError * err ) {
+ // a killCursors message shouldn't affect last error
+ assert( err );
+ if ( m.operation() == dbKillCursors ) {
+ err->disabled = true;
+ }
+ else {
+ err->disabled = false;
+ err->nPrev++;
+ }
+ }
+
+ LastError * LastErrorHolder::startRequest( Message& m , LastError * le ) {
+ assert( le );
+ prepareErrForNewRequest( m, le );
+ return le;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/lasterror.h b/src/mongo/db/lasterror.h
new file mode 100644
index 00000000000..86250e496a8
--- /dev/null
+++ b/src/mongo/db/lasterror.h
@@ -0,0 +1,146 @@
+// lasterror.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../bson/oid.h"
+
+namespace mongo {
+ class BSONObjBuilder;
+ class Message;
+
+ struct LastError {
+ int code;
+ string msg;
+ enum UpdatedExistingType { NotUpdate, True, False } updatedExisting;
+ OID upsertedId;
+ OID writebackId;
+ long long nObjects;
+ int nPrev;
+ bool valid;
+ bool disabled;
+ void writeback( OID& oid ) {
+ reset( true );
+ writebackId = oid;
+ }
+ void raiseError(int _code , const char *_msg) {
+ reset( true );
+ code = _code;
+ msg = _msg;
+ }
+ void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ) {
+ reset( true );
+ nObjects = _nObjects;
+ updatedExisting = _updateObjects ? True : False;
+ if ( _upsertedId.isSet() )
+ upsertedId = _upsertedId;
+
+ }
+ void recordDelete( long long nDeleted ) {
+ reset( true );
+ nObjects = nDeleted;
+ }
+ LastError() {
+ reset();
+ }
+ void reset( bool _valid = false ) {
+ code = 0;
+ msg.clear();
+ updatedExisting = NotUpdate;
+ nObjects = 0;
+ nPrev = 1;
+ valid = _valid;
+ disabled = false;
+ upsertedId.clear();
+ writebackId.clear();
+ }
+
+ /**
+ * @return if there is an err
+ */
+ bool appendSelf( BSONObjBuilder &b , bool blankErr = true );
+
+ struct Disabled : boost::noncopyable {
+ Disabled( LastError * le ) {
+ _le = le;
+ if ( _le ) {
+ _prev = _le->disabled;
+ _le->disabled = true;
+ }
+ else {
+ _prev = false;
+ }
+ }
+
+ ~Disabled() {
+ if ( _le )
+ _le->disabled = _prev;
+ }
+
+ LastError * _le;
+ bool _prev;
+ };
+
+ static LastError noError;
+ };
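+
+ // Typical flow (illustrative): a failing operation calls raiseError( code, msg )
+ // on the thread's LastError; a later getLastError command reports it via
+ // appendSelf(), producing something like { err: "...", code: ..., n: 0 }.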
+
+ extern class LastErrorHolder {
+ public:
+ LastErrorHolder(){}
+ ~LastErrorHolder();
+
+ LastError * get( bool create = false );
+ LastError * getSafe() {
+ LastError * le = get(false);
+ if ( ! le ) {
+ error() << " no LastError!" << endl;
+ assert( le );
+ }
+ return le;
+ }
+
+ LastError * _get( bool create = false ); // may return a disabled LastError
+
+ void reset( LastError * le );
+
+ /** ok to call more than once. */
+ void initThread();
+
+ int getID();
+
+ void release();
+
+ /** when db receives a message/request, call this */
+ LastError * startRequest( Message& m , LastError * connectionOwned );
+
+ void disconnect( int clientId );
+
+ // used to disable lastError reporting while processing a killCursors message
+ // disable causes get() to return 0.
+ LastError *disableForCommand(); // only call once per command invocation!
+ private:
+ boost::thread_specific_ptr<LastError> _tl;
+
+ struct Status {
+ time_t time;
+ LastError *lerr;
+ };
+ } lastError;
+
+ void raiseError(int code , const char *msg);
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher.cpp b/src/mongo/db/matcher.cpp
new file mode 100755
index 00000000000..2631845a757
--- /dev/null
+++ b/src/mongo/db/matcher.cpp
@@ -0,0 +1,1128 @@
+// matcher.cpp
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "matcher.h"
+#include "../util/goodies.h"
+#include "../util/unittest.h"
+#include "diskloc.h"
+#include "../scripting/engine.h"
+#include "db.h"
+#include "queryutil.h"
+#include "client.h"
+
+#include "pdfile.h"
+
+namespace {
+ inline pcrecpp::RE_Options flags2options(const char* flags) {
+ pcrecpp::RE_Options options;
+ options.set_utf8(true);
+ while ( flags && *flags ) {
+ if ( *flags == 'i' )
+ options.set_caseless(true);
+ else if ( *flags == 'm' )
+ options.set_multiline(true);
+ else if ( *flags == 'x' )
+ options.set_extended(true);
+ else if ( *flags == 's' )
+ options.set_dotall(true);
+ flags++;
+ }
+ return options;
+ }
+}
+
+//#define DEBUGMATCHER(x) cout << x << endl;
+#define DEBUGMATCHER(x)
+
+namespace mongo {
+
+ extern BSONObj staticNull;
+
+ class Where {
+ public:
+ Where() {
+ jsScope = 0;
+ func = 0;
+ }
+ ~Where() {
+
+ if ( scope.get() ){
+ try {
+ scope->execSetup( "_mongo.readOnly = false;" , "make not read only" );
+ }
+ catch( DBException& e ){
+ warning() << "javascript scope cleanup interrupted" << causedBy( e ) << endl;
+ }
+ }
+
+ if ( jsScope ) {
+ delete jsScope;
+ jsScope = 0;
+ }
+ func = 0;
+ }
+
+ auto_ptr<Scope> scope;
+ ScriptingFunction func;
+ BSONObj *jsScope;
+
+ void setFunc(const char *code) {
+ massert( 10341 , "scope has to be created first!" , scope.get() );
+ func = scope->createFunction( code );
+ }
+
+ };
+
+ Matcher::~Matcher() {
+ delete _where;
+ _where = 0;
+ }
+
+ ElementMatcher::ElementMatcher( BSONElement e , int op, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
+ if ( op == BSONObj::opMOD ) {
+ BSONObj o = e.embeddedObject();
+ _mod = o["0"].numberInt();
+ _modm = o["1"].numberInt();
+
+ uassert( 10073 , "mod can't be 0" , _mod );
+ }
+ else if ( op == BSONObj::opTYPE ) {
+ _type = (BSONType)(e.numberInt());
+ }
+ else if ( op == BSONObj::opELEM_MATCH ) {
+ BSONElement m = e;
+ uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object );
+ BSONObj x = m.embeddedObject();
+ if ( x.firstElement().getGtLtOp() == 0 ) {
+ _subMatcher.reset( new Matcher( x ) );
+ _subMatcherOnPrimitives = false;
+ }
+ else {
+ // meant to act on primitives
+ _subMatcher.reset( new Matcher( BSON( "" << x ) ) );
+ _subMatcherOnPrimitives = true;
+ }
+ }
+ }
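+
+ // e.g. { a : { $mod : [ 4, 1 ] } } stores _mod == 4 and _modm == 1
+ // (match when the value % 4 == 1).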
+
+ ElementMatcher::ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot )
+ : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) {
+
+ _myset.reset( new set<BSONElement,element_lt>() );
+
+ BSONObjIterator i( array );
+ while ( i.more() ) {
+ BSONElement ie = i.next();
+ if ( op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ shared_ptr<Matcher> s;
+ s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) );
+ _allMatchers.push_back( s );
+ }
+ else if ( ie.type() == RegEx ) {
+ if ( !_myregex.get() ) {
+ _myregex.reset( new vector< RegexMatcher >() );
+ }
+ _myregex->push_back( RegexMatcher() );
+ RegexMatcher &rm = _myregex->back();
+ rm._re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) );
+ rm._fieldName = 0; // no need for field name
+ rm._regex = ie.regex();
+ rm._flags = ie.regexFlags();
+ rm._isNot = false;
+ bool purePrefix;
+ string prefix = simpleRegex(rm._regex, rm._flags, &purePrefix);
+ if (purePrefix)
+ rm._prefix = prefix;
+ }
+ else {
+ uassert( 15882, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ _myset->insert(ie);
+ }
+ }
+
+ if ( _allMatchers.size() ) {
+ uassert( 13020 , "with $all, can't mix $elemMatch and others" , _myset->size() == 0 && !_myregex.get());
+ }
+
+ }
+
+ int ElementMatcher::inverseOfNegativeCompareOp() const {
+ verify( 15892, negativeCompareOp() );
+ return _compareOp == BSONObj::NE ? BSONObj::Equality : BSONObj::opIN;
+ }
+
+ bool ElementMatcher::negativeCompareOpContainsNull() const {
+ verify( 15893, negativeCompareOp() );
+ return (_compareOp == BSONObj::NE && _toMatch.type() != jstNULL) ||
+ (_compareOp == BSONObj::NIN && _myset->count( staticNull.firstElement()) == 0 );
+ }
+
+ void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot) {
+
+ RegexMatcher rm;
+ rm._re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
+ rm._fieldName = fieldName;
+ rm._regex = regex;
+ rm._flags = flags;
+ rm._isNot = isNot;
+
+        if (!isNot) { //TODO something smarter
+            bool purePrefix;
+            string prefix = simpleRegex(regex, flags, &purePrefix);
+            if (purePrefix)
+                rm._prefix = prefix;
+        }
+        // push only after _prefix is set, so the stored copy keeps the prefix
+        _regexs.push_back(rm);
+ }
+
+ bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) {
+ const char *fn = fe.fieldName();
+ int op = fe.getGtLtOp( -1 );
+ if ( op == -1 ) {
+ if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ) {
+ return false; // { $ref : xxx } - treat as normal object
+ }
+ uassert( 10068 , (string)"invalid operator: " + fn , op != -1 );
+ }
+
+ switch ( op ) {
+ case BSONObj::GT:
+ case BSONObj::GTE:
+ case BSONObj::LT:
+ case BSONObj::LTE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), op, isNot);
+ break;
+ }
+ case BSONObj::NE: {
+ _haveNeg = true;
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::NE, isNot);
+ break;
+ }
+ case BSONObj::opALL:
+ _all = true;
+ case BSONObj::opIN: {
+ uassert( 13276 , "$in needs an array" , fe.isABSONObj() );
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ BSONObjIterator i( fe.embeddedObject() );
+ while( i.more() ) {
+ if ( i.next().type() == Array ) {
+ _hasArray = true;
+ }
+ }
+ break;
+ }
+ case BSONObj::NIN:
+ uassert( 13277 , "$nin needs an array" , fe.isABSONObj() );
+ _haveNeg = true;
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ break;
+ case BSONObj::opMOD:
+ case BSONObj::opTYPE:
+ case BSONObj::opELEM_MATCH: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ // these are types where ElementMatcher has all the info
+ _basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
+ break;
+ }
+ case BSONObj::opSIZE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot);
+ _haveSize = true;
+ break;
+ }
+ case BSONObj::opEXISTS: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot);
+ break;
+ }
+ case BSONObj::opREGEX: {
+ uassert( 13032, "can't use $not with $regex, use BSON regex type instead", !isNot );
+ if ( fe.type() == RegEx ) {
+ regex = fe.regex();
+ flags = fe.regexFlags();
+ }
+ else {
+ regex = fe.valuestrsafe();
+ }
+ break;
+ }
+ case BSONObj::opOPTIONS: {
+ uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot );
+ flags = fe.valuestrsafe();
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ case BSONObj::opMAX_DISTANCE:
+ break;
+ default:
+            uassert( 10069 , (string)"BUG - can't handle operator: " + fn , 0 );
+ }
+ return true;
+ }
+
+ void Matcher::parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers ) {
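+        // e.g. for { $or : [ { a : 1 }, { b : 2 } ] }, e is the $or element and
+        // two sub-Matchers ( {a:1} and {b:2} ) are appended to 'matchers'.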
+ uassert( 13086, "$and/$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13087, "$and/$or/$nor match element must be an object", f.type() == Object );
+ matchers.push_back( shared_ptr< Matcher >( new Matcher( f.embeddedObject(), true ) ) );
+ }
+ }
+
+ bool Matcher::parseClause( const BSONElement &e ) {
+ const char *ef = e.fieldName();
+
+ if ( ef[ 0 ] != '$' )
+ return false;
+
+ // $and
+ if ( ef[ 1 ] == 'a' && ef[ 2 ] == 'n' && ef[ 3 ] == 'd' ) {
+ parseExtractedClause( e, _andMatchers );
+ return true;
+ }
+
+ // $or
+ if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) {
+ parseExtractedClause( e, _orMatchers );
+ return true;
+ }
+
+ // $nor
+ if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) {
+ parseExtractedClause( e, _norMatchers );
+ return true;
+ }
+
+ // $comment
+ if ( ef[ 1 ] == 'c' && ef[ 2 ] == 'o' && ef[ 3 ] == 'm' && str::equals( ef , "$comment" ) ) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // $where: function()...
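+    // e.g. { $where : "this.a > this.b" } -- the code string (or Code /
+    // CodeWScope element) is compiled once here and invoked per document
+    // from matches().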
+ NOINLINE_DECL void Matcher::parseWhere( const BSONElement &e ) {
+ uassert(15902 , "$where expression has an unexpected type", e.type() == String || e.type() == CodeWScope || e.type() == Code );
+ uassert( 10066 , "$where may only appear once in query", _where == 0 );
+ uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
+        massert( 13089 , "need a current client for $where" , haveClient() );
+ _where = new Where();
+ _where->scope = globalScriptEngine->getPooledScope( cc().ns() );
+ _where->scope->localConnect( cc().database()->name.c_str() );
+
+ if ( e.type() == CodeWScope ) {
+ _where->setFunc( e.codeWScopeCode() );
+ _where->jsScope = new BSONObj( e.codeWScopeScopeData() );
+ }
+ else {
+ const char *code = e.valuestr();
+ _where->setFunc(code);
+ }
+
+ _where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
+ }
+
+ void Matcher::parseMatchExpressionElement( const BSONElement &e, bool nested ) {
+
+ uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined );
+
+ if ( parseClause( e ) ) {
+ return;
+ }
+
+ const char *fn = e.fieldName();
+ if ( str::equals(fn, "$where") ) {
+ parseWhere(e);
+ return;
+ }
+
+ if ( e.type() == RegEx ) {
+ addRegex( fn, e.regex(), e.regexFlags() );
+ return;
+ }
+
+ // greater than / less than...
+ // e.g., e == { a : { $gt : 3 } }
+ // or
+ // { a : { $in : [1,2,3] } }
+ if ( e.type() == Object ) {
+ // support {$regex:"a|b", $options:"imx"}
+ const char* regex = NULL;
+ const char* flags = "";
+
+ // e.g., fe == { $gt : 3 }
+ BSONObjIterator j(e.embeddedObject());
+ bool isOperator = false;
+ while ( j.more() ) {
+ BSONElement fe = j.next();
+ const char *fn = fe.fieldName();
+
+ if ( fn[0] == '$' && fn[1] ) {
+ isOperator = true;
+
+ if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) {
+ _haveNeg = true;
+ switch( fe.type() ) {
+ case Object: {
+ BSONObjIterator k( fe.embeddedObject() );
+ uassert( 13030, "$not cannot be empty", k.more() );
+ while( k.more() ) {
+ addOp( e, k.next(), true, regex, flags );
+ }
+ break;
+ }
+ case RegEx:
+ addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true );
+ break;
+ default:
+ uassert( 13031, "invalid use of $not", false );
+ }
+ }
+ else {
+ if ( !addOp( e, fe, false, regex, flags ) ) {
+ isOperator = false;
+ break;
+ }
+ }
+ }
+ else {
+ isOperator = false;
+ break;
+ }
+ }
+ if (regex) {
+ addRegex(e.fieldName(), regex, flags);
+ }
+ if ( isOperator )
+ return;
+ }
+
+ if ( e.type() == Array ) {
+ _hasArray = true;
+ }
+ else if( *fn == '$' ) {
+ if( str::equals(fn, "$atomic") || str::equals(fn, "$isolated") ) {
+ uassert( 14844, "$atomic specifier must be a top level field", !nested );
+ _atomic = e.trueValue();
+ return;
+ }
+ }
+
+ // normal, simple case e.g. { a : "foo" }
+ addBasic(e, BSONObj::Equality, false);
+ }
+
+ /* _jsobj - the query pattern
+ */
+ Matcher::Matcher(const BSONObj &jsobj, bool nested) :
+ _where(0), _jsobj(jsobj), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) {
+
+ BSONObjIterator i(_jsobj);
+ while ( i.more() ) {
+ parseMatchExpressionElement( i.next(), nested );
+ }
+ }
+
+ Matcher::Matcher( const Matcher &docMatcher, const BSONObj &key ) :
+ _where(0), _constrainIndexKey( key ), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) {
+ // Filter out match components that will provide an incorrect result
+ // given a key from a single key index.
+ for( vector< ElementMatcher >::const_iterator i = docMatcher._basics.begin(); i != docMatcher._basics.end(); ++i ) {
+ if ( key.hasField( i->_toMatch.fieldName() ) ) {
+ switch( i->_compareOp ) {
+ case BSONObj::opSIZE:
+ case BSONObj::opALL:
+ case BSONObj::NE:
+ case BSONObj::NIN:
+ case BSONObj::opEXISTS: // We can't match on index in this case.
+ case BSONObj::opTYPE: // For $type:10 (null), a null key could be a missing field or a null value field.
+ break;
+ case BSONObj::opIN: {
+ bool inContainsArray = false;
+ for( set<BSONElement,element_lt>::const_iterator j = i->_myset->begin(); j != i->_myset->end(); ++j ) {
+ if ( j->type() == Array ) {
+ inContainsArray = true;
+ break;
+ }
+ }
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && !inContainsArray ) {
+ _basics.push_back( *i );
+ }
+ break;
+ }
+ default: {
+ // Can't match an array to its first indexed element.
+ if ( !i->_isNot && i->_toMatch.type() != Array ) {
+ _basics.push_back( *i );
+ }
+ }
+ }
+ }
+ }
+ for( vector<RegexMatcher>::const_iterator it = docMatcher._regexs.begin();
+ it != docMatcher._regexs.end();
+ ++it) {
+ if ( !it->_isNot && key.hasField( it->_fieldName ) ) {
+ _regexs.push_back(*it);
+ }
+ }
+ // Recursively filter match components for and and or matchers.
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._andMatchers.begin(); i != docMatcher._andMatchers.end(); ++i ) {
+ _andMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
+ }
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._orMatchers.begin(); i != docMatcher._orMatchers.end(); ++i ) {
+ _orMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) );
+ }
+ }
+
+ inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) {
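+        // Fast path: when the pattern reduced to a pure prefix (e.g. /^abc/),
+        // a strncmp against _prefix replaces the full PCRE PartialMatch.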
+ switch (e.type()) {
+ case String:
+ case Symbol:
+ if (rm._prefix.empty())
+ return rm._re->PartialMatch(e.valuestr());
+ else
+ return !strncmp(e.valuestr(), rm._prefix.c_str(), rm._prefix.size());
+ case RegEx:
+ return !strcmp(rm._regex, e.regex()) && !strcmp(rm._flags, e.regexFlags());
+ default:
+ return false;
+ }
+ }
+
+ inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const {
+ assert( op != BSONObj::NE && op != BSONObj::NIN );
+
+ if ( op == BSONObj::Equality ) {
+ return l.valuesEqual(r);
+ }
+
+ if ( op == BSONObj::opIN ) {
+ // { $in : [1,2,3] }
+ int count = bm._myset->count(l);
+ if ( count )
+ return count;
+ if ( bm._myregex.get() ) {
+ for( vector<RegexMatcher>::const_iterator i = bm._myregex->begin(); i != bm._myregex->end(); ++i ) {
+ if ( regexMatches( *i, l ) ) {
+ return true;
+ }
+ }
+ }
+ }
+
+ if ( op == BSONObj::opSIZE ) {
+ if ( l.type() != Array )
+ return 0;
+ int count = 0;
+ BSONObjIterator i( l.embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ ++count;
+ }
+ return count == r.number();
+ }
+
+ if ( op == BSONObj::opMOD ) {
+ if ( ! l.isNumber() )
+ return false;
+
+ return l.numberLong() % bm._mod == bm._modm;
+ }
+
+ if ( op == BSONObj::opTYPE ) {
+ return bm._type == l.type();
+ }
+
+ /* check LT, GTE, ... */
+ if ( l.canonicalType() != r.canonicalType() )
+ return false;
+ int c = compareElementValues(l, r);
+ if ( c < -1 ) c = -1;
+ if ( c > 1 ) c = 1;
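+        // Map the clamped result c in {-1,0,1} to a bit in {1,2,4}; the
+        // LT/LTE/GT/GTE op codes are bitmasks over these "less/equal/greater"
+        // bits, so the AND below answers the comparison directly.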
+ int z = 1 << (c+1);
+ return (op & z);
+ }
+
+ int Matcher::inverseMatch(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm , MatchDetails * details ) const {
+ int inverseRet = matchesDotted( fieldName, toMatch, obj, bm.inverseOfNegativeCompareOp(), bm , false , details );
+ if ( bm.negativeCompareOpContainsNull() ) {
+ return ( inverseRet <= 0 ) ? 1 : 0;
+ }
+ return -inverseRet;
+ }
+
+ int retExistsFound( const ElementMatcher &bm ) {
+ return bm._toMatch.trueValue() ? 1 : -1;
+ }
+
+ /* Check if a particular field matches.
+
+ fieldName - field to match "a.b" if we are reaching into an embedded object.
+ toMatch - element we want to match.
+ obj - database object to check against
+ compareOp - Equality, LT, GT, etc. This may be different than, and should supersede, the compare op in em.
+       isArr - true if we recursed into an array while walking the field path
+
+ Special forms:
+
+ { "a.b" : 3 } means obj.a.b == 3
+ { a : { $lt : 3 } } means obj.a < 3
+ { a : { $in : [1,2] } } means [1,2].contains(obj.a)
+
+ return value
+ -1 mismatch
+ 0 missing element
+ 1 match
+ */
+ int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) const {
+ DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? "yes" : "no" ) );
+
+ if ( compareOp == BSONObj::opALL ) {
+
+ if ( em._allMatchers.size() ) {
+ // $all query matching will not be performed against indexes, so the field
+ // to match is always extracted from the full document.
+ BSONElement e = obj.getFieldDotted( fieldName );
+ // The $all/$elemMatch operator only matches arrays.
+ if ( e.type() != Array ) {
+ return -1;
+ }
+
+ for ( unsigned i=0; i<em._allMatchers.size(); i++ ) {
+ bool found = false;
+ BSONObjIterator x( e.embeddedObject() );
+ while ( x.more() ) {
+ BSONElement f = x.next();
+
+ if ( f.type() != Object )
+ continue;
+ if ( em._allMatchers[i]->matches( f.embeddedObject() ) ) {
+ found = true;
+ break;
+ }
+ }
+
+ if ( ! found )
+ return -1;
+ }
+
+ return 1;
+ }
+
+ if ( em._myset->size() == 0 && !em._myregex.get() )
+ return -1; // is this desired?
+
+ BSONElementSet myValues;
+ obj.getFieldsDotted( fieldName , myValues );
+
+ for( set< BSONElement, element_lt >::const_iterator i = em._myset->begin(); i != em._myset->end(); ++i ) {
+ // ignore nulls
+ if ( i->type() == jstNULL )
+ continue;
+
+ if ( myValues.count( *i ) == 0 )
+ return -1;
+ }
+
+ if ( !em._myregex.get() )
+ return 1;
+
+ for( vector< RegexMatcher >::const_iterator i = em._myregex->begin(); i != em._myregex->end(); ++i ) {
+ bool match = false;
+ for( BSONElementSet::const_iterator j = myValues.begin(); j != myValues.end(); ++j ) {
+ if ( regexMatches( *i, *j ) ) {
+ match = true;
+ break;
+ }
+ }
+ if ( !match )
+ return -1;
+ }
+
+ return 1;
+ } // end opALL
+
+ if ( compareOp == BSONObj::NE || compareOp == BSONObj::NIN ) {
+ return inverseMatch( fieldName, toMatch, obj, em , details );
+ }
+
+ BSONElement e;
+ bool indexed = !_constrainIndexKey.isEmpty();
+ if ( indexed ) {
+ e = obj.getFieldUsingIndexNames(fieldName, _constrainIndexKey);
+ if( e.eoo() ) {
+ cout << "obj: " << obj << endl;
+ cout << "fieldName: " << fieldName << endl;
+ cout << "_constrainIndexKey: " << _constrainIndexKey << endl;
+ assert( !e.eoo() );
+ }
+ }
+ else {
+
+ const char *p = strchr(fieldName, '.');
+ if ( p ) {
+ string left(fieldName, p-fieldName);
+
+ BSONElement se = obj.getField(left.c_str());
+            if ( !se.eoo() && ( se.type() == Object || se.type() == Array ) ) {
+                BSONObj eo = se.embeddedObject();
+                return matchesDotted(p+1, toMatch, eo, compareOp, em, se.type() == Array , details );
+            }
+ }
+
+ // An array was encountered while scanning for components of the field name.
+ if ( isArr ) {
+ DEBUGMATCHER( "\t\t isArr 1 : obj : " << obj );
+ BSONObjIterator ai(obj);
+ bool found = false;
+ while ( ai.moreWithEOO() ) {
+ BSONElement z = ai.next();
+
+ if( strcmp(z.fieldName(),fieldName) == 0 ) {
+ if ( compareOp == BSONObj::opEXISTS ) {
+ return retExistsFound( em );
+ }
+ if (valuesMatch(z, toMatch, compareOp, em) ) {
+ // "field.<n>" array notation was used
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+
+ if ( z.type() == Object ) {
+ BSONObj eo = z.embeddedObject();
+ int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, em, false, details );
+ if ( cmp > 0 ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ else if ( cmp < 0 ) {
+ found = true;
+ }
+ }
+ }
+ return found ? -1 : 0;
+ }
+
+ if( p ) {
+ // Left portion of field name was not found or wrong type.
+ return 0;
+ }
+ else {
+ e = obj.getField(fieldName);
+ }
+ }
+
+ if ( compareOp == BSONObj::opEXISTS ) {
+ if( e.eoo() ) {
+ return 0;
+ } else {
+ return retExistsFound( em );
+ }
+ }
+ else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) &&
+ valuesMatch(e, toMatch, compareOp, em ) ) {
+ return 1;
+ }
+ else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) {
+ BSONObjIterator ai(e.embeddedObject());
+
+ while ( ai.moreWithEOO() ) {
+ BSONElement z = ai.next();
+
+ if ( compareOp == BSONObj::opELEM_MATCH ) {
+ if ( z.type() == Object ) {
+ if ( em._subMatcher->matches( z.embeddedObject() ) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+ else if ( em._subMatcherOnPrimitives ) {
+ if ( z.type() && em._subMatcher->matches( z.wrap( "" ) ) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+ }
+ else {
+ if ( valuesMatch( z, toMatch, compareOp, em) ) {
+ if ( details )
+ details->_elemMatchKey = z.fieldName();
+ return 1;
+ }
+ }
+
+ }
+
+ // match an entire array to itself
+ if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ) {
+ return 1;
+ }
+ if ( compareOp == BSONObj::opIN && valuesMatch( e, toMatch, compareOp, em ) ) {
+ return 1;
+ }
+ }
+ else if ( e.eoo() ) {
+ return 0;
+ }
+ return -1;
+ }
+
+ extern int dump;
+
+ /* See if an object matches the query.
+ */
+ bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) const {
+ LOG(5) << "Matcher::matches() " << jsobj.toString() << endl;
+
+ /* assuming there is usually only one thing to match. if more this
+ could be slow sometimes. */
+
+ // check normal non-regex cases:
+ for ( unsigned i = 0; i < _basics.size(); i++ ) {
+ const ElementMatcher& bm = _basics[i];
+ const BSONElement& m = bm._toMatch;
+ // -1=mismatch. 0=missing element. 1=match
+ int cmp = matchesDotted(m.fieldName(), m, jsobj, bm._compareOp, bm , false , details );
+ if ( cmp == 0 && bm._compareOp == BSONObj::opEXISTS ) {
+ // If missing, match cmp is opposite of $exists spec.
+ cmp = -retExistsFound(bm);
+ }
+ if ( bm._isNot )
+ cmp = -cmp;
+ if ( cmp < 0 )
+ return false;
+ if ( cmp == 0 ) {
+ /* missing is ok iff we were looking for null */
+ if ( m.type() == jstNULL || m.type() == Undefined ||
+ ( ( bm._compareOp == BSONObj::opIN || bm._compareOp == BSONObj::NIN ) && bm._myset->count( staticNull.firstElement() ) > 0 ) ) {
+ if ( bm.negativeCompareOp() ^ bm._isNot ) {
+ return false;
+ }
+ }
+ else {
+ if ( !bm._isNot ) {
+ return false;
+ }
+ }
+ }
+ }
+
+ for (vector<RegexMatcher>::const_iterator it = _regexs.begin();
+ it != _regexs.end();
+ ++it) {
+ BSONElementSet s;
+ if ( !_constrainIndexKey.isEmpty() ) {
+ BSONElement e = jsobj.getFieldUsingIndexNames(it->_fieldName, _constrainIndexKey);
+
+ // Should only have keys nested one deep here, for geo-indices
+ // TODO: future indices may nest deeper?
+ if( e.type() == Array ){
+ BSONObjIterator i( e.Obj() );
+ while( i.more() ){
+ s.insert( i.next() );
+ }
+ }
+ else if ( !e.eoo() )
+ s.insert( e );
+
+ }
+ else {
+ jsobj.getFieldsDotted( it->_fieldName, s );
+ }
+ bool match = false;
+ for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i )
+ if ( regexMatches(*it, *i) )
+ match = true;
+ if ( !match ^ it->_isNot )
+ return false;
+ }
+
+ if ( _orDedupConstraints.size() > 0 ) {
+ for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orDedupConstraints.begin();
+ i != _orDedupConstraints.end(); ++i ) {
+ if ( (*i)->matches( jsobj ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _andMatchers.size() > 0 ) {
+ for( list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ i != _andMatchers.end(); ++i ) {
+ // SERVER-3192 Track field matched using details the same as for
+ // top level fields, at least for now.
+ if ( !(*i)->matches( jsobj, details ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _orMatchers.size() > 0 ) {
+ bool match = false;
+ for( list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
+ i != _orMatchers.end(); ++i ) {
+ // SERVER-205 don't submit details - we don't want to track field
+ // matched within $or
+ if ( (*i)->matches( jsobj ) ) {
+ match = true;
+ break;
+ }
+ }
+ if ( !match ) {
+ return false;
+ }
+ }
+
+ if ( _norMatchers.size() > 0 ) {
+ for( list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin();
+ i != _norMatchers.end(); ++i ) {
+ // SERVER-205 don't submit details - we don't want to track field
+ // matched within $nor
+ if ( (*i)->matches( jsobj ) ) {
+ return false;
+ }
+ }
+ }
+
+ if ( _where ) {
+ if ( _where->func == 0 ) {
+ uassert( 10070 , "$where compile error", false);
+ return false; // didn't compile
+ }
+
+ if ( _where->jsScope ) {
+ _where->scope->init( _where->jsScope );
+ }
+ _where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) );
+ _where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant
+
+ int err = _where->scope->invoke( _where->func , 0, &jsobj , 1000 * 60 , false );
+ if ( err == -3 ) { // INVOKE_ERROR
+ stringstream ss;
+ ss << "error on invocation of $where function:\n"
+ << _where->scope->getError();
+ uassert( 10071 , ss.str(), false);
+ return false;
+ }
+ else if ( err != 0 ) { // ! INVOKE_SUCCESS
+ uassert( 10072 , "unknown error in invocation of $where function", false);
+ return false;
+ }
+ return _where->scope->getBoolean( "return" ) != 0;
+
+ }
+
+ return true;
+ }
+
+ bool Matcher::keyMatch( const Matcher &docMatcher ) const {
+ // Quick check certain non key match cases.
+ if ( docMatcher._all
+ || docMatcher._haveSize
+ || docMatcher._hasArray // We can't match an array to its first indexed element using keymatch
+ || docMatcher._haveNeg ) {
+ return false;
+ }
+
+ // Check that all match components are available in the index matcher.
+ if ( !( _basics.size() == docMatcher._basics.size() && _regexs.size() == docMatcher._regexs.size() && !docMatcher._where ) ) {
+ return false;
+ }
+ if ( _andMatchers.size() != docMatcher._andMatchers.size() ) {
+ return false;
+ }
+ if ( _orMatchers.size() != docMatcher._orMatchers.size() ) {
+ return false;
+ }
+ if ( docMatcher._norMatchers.size() > 0 ) {
+ return false;
+ }
+ if ( docMatcher._orDedupConstraints.size() > 0 ) {
+ return false;
+ }
+
+ // Recursively check that all submatchers support key match.
+ {
+ list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._andMatchers.begin();
+ while( i != _andMatchers.end() ) {
+ if ( !(*i)->keyMatch( **j ) ) {
+ return false;
+ }
+ ++i; ++j;
+ }
+ }
+ {
+ list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin();
+ list< shared_ptr< Matcher > >::const_iterator j = docMatcher._orMatchers.begin();
+ while( i != _orMatchers.end() ) {
+ if ( !(*i)->keyMatch( **j ) ) {
+ return false;
+ }
+ ++i; ++j;
+ }
+ }
+ // Nor matchers and or dedup constraints aren't created for index matchers,
+ // so no need to check those here.
+ return true;
+ }
+
+
+    /* -- just for testing -- */
+#pragma pack(1)
+ struct JSObj1 {
+ JSObj1() {
+ totsize=sizeof(JSObj1);
+ n = NumberDouble;
+ strcpy_s(nname, 5, "abcd");
+ N = 3.1;
+ s = String;
+ strcpy_s(sname, 7, "abcdef");
+ slen = 10;
+ strcpy_s(sval, 10, "123456789");
+ eoo = EOO;
+ }
+ unsigned totsize;
+
+ char n;
+ char nname[5];
+ double N;
+
+ char s;
+ char sname[7];
+ unsigned slen;
+ char sval[10];
+
+ char eoo;
+ };
+#pragma pack()
+
+ struct JSObj1 js1;
+
+#pragma pack(1)
+ struct JSObj2 {
+ JSObj2() {
+ totsize=sizeof(JSObj2);
+ s = String;
+ strcpy_s(sname, 7, "abcdef");
+ slen = 10;
+ strcpy_s(sval, 10, "123456789");
+ eoo = EOO;
+ }
+ unsigned totsize;
+ char s;
+ char sname[7];
+ unsigned slen;
+ char sval[10];
+ char eoo;
+ } js2;
+
+ struct JSUnitTest : public UnitTest {
+ void run() {
+
+ BSONObj j1((const char *) &js1);
+ BSONObj j2((const char *) &js2);
+ Matcher m(j2);
+ assert( m.matches(j1) );
+ js2.sval[0] = 'z';
+ assert( !m.matches(j1) );
+ Matcher n(j1);
+ assert( n.matches(j1) );
+ assert( !n.matches(j2) );
+
+ BSONObj j0 = BSONObj();
+// BSONObj j0((const char *) &js0);
+ Matcher p(j0);
+ assert( p.matches(j1) );
+ assert( p.matches(j2) );
+ }
+ } jsunittest;
+
+#pragma pack()
+
+ struct RXTest : public UnitTest {
+
+ RXTest() {
+ }
+
+ void run() {
+ /*
+ static const boost::regex e("(\\d{4}[- ]){3}\\d{4}");
+ static const boost::regex b(".....");
+ out() << "regex result: " << regex_match("hello", e) << endl;
+ out() << "regex result: " << regex_match("abcoo", b) << endl;
+ */
+
+ int ret = 0;
+
+ pcre_config( PCRE_CONFIG_UTF8 , &ret );
+ massert( 10342 , "pcre not compiled with utf8 support" , ret );
+
+ pcrecpp::RE re1(")({a}h.*o");
+ pcrecpp::RE re("h.llo");
+ assert( re.FullMatch("hello") );
+ assert( !re1.FullMatch("hello") );
+
+
+ pcrecpp::RE_Options options;
+ options.set_utf8(true);
+ pcrecpp::RE part("dwi", options);
+ assert( part.PartialMatch("dwight") );
+
+ pcre_config( PCRE_CONFIG_UNICODE_PROPERTIES , &ret );
+ if ( ! ret )
+ cout << "warning: some regex utf8 things will not work. pcre build doesn't have --enable-unicode-properties" << endl;
+
+ }
+ } rxtest;
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher.h b/src/mongo/db/matcher.h
new file mode 100644
index 00000000000..b6994a79229
--- /dev/null
+++ b/src/mongo/db/matcher.h
@@ -0,0 +1,276 @@
+// matcher.h
+
+/* Matcher is our boolean expression evaluator for "where" clauses */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "jsobj.h"
+#include "pcrecpp.h"
+
+namespace mongo {
+
+ class Cursor;
+ class CoveredIndexMatcher;
+ class Matcher;
+ class FieldRangeVector;
+
+ class RegexMatcher {
+ public:
+ const char *_fieldName;
+ const char *_regex;
+ const char *_flags;
+ string _prefix;
+ shared_ptr< pcrecpp::RE > _re;
+ bool _isNot;
+ RegexMatcher() : _isNot() {}
+ };
+
+ struct element_lt {
+ bool operator()(const BSONElement& l, const BSONElement& r) const {
+ int x = (int) l.canonicalType() - (int) r.canonicalType();
+ if ( x < 0 ) return true;
+ else if ( x > 0 ) return false;
+ return compareElementValues(l,r) < 0;
+ }
+ };
+
+
+ class ElementMatcher {
+ public:
+
+ ElementMatcher() {
+ }
+
+ ElementMatcher( BSONElement e , int op, bool isNot );
+
+ ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot );
+
+ ~ElementMatcher() { }
+
+ bool negativeCompareOp() const { return _compareOp == BSONObj::NE || _compareOp == BSONObj::NIN; }
+ int inverseOfNegativeCompareOp() const;
+ bool negativeCompareOpContainsNull() const;
+
+ BSONElement _toMatch;
+ int _compareOp;
+ bool _isNot;
+ shared_ptr< set<BSONElement,element_lt> > _myset;
+ shared_ptr< vector<RegexMatcher> > _myregex;
+
+ // these are for specific operators
+ int _mod;
+ int _modm;
+ BSONType _type;
+
+ shared_ptr<Matcher> _subMatcher;
+        bool _subMatcherOnPrimitives;
+
+ vector< shared_ptr<Matcher> > _allMatchers;
+ };
+
+ class Where; // used for $where javascript eval
+ class DiskLoc;
+
+ struct MatchDetails {
+ MatchDetails() {
+ reset();
+ }
+
+ void reset() {
+ _loadedObject = false;
+ _elemMatchKey = 0;
+ }
+
+ string toString() const {
+ stringstream ss;
+ ss << "loadedObject: " << _loadedObject << " ";
+ ss << "elemMatchKey: " << ( _elemMatchKey ? _elemMatchKey : "NULL" ) << " ";
+ return ss.str();
+ }
+
+ bool _loadedObject;
+ const char * _elemMatchKey; // warning, this may go out of scope if matched object does
+ };
+
+ /* Match BSON objects against a query pattern.
+
+ e.g.
+ db.foo.find( { a : 3 } );
+
+ { a : 3 } is the pattern object. See wiki documentation for full info.
+
+ GT/LT:
+ { a : { $gt : 3 } }
+ Not equal:
+ { a : { $ne : 3 } }
+
+ TODO: we should rewrite the matcher to be more an AST style.
+ */
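+    /* Typical use (illustrative):
+
+           Matcher m( BSON( "a" << BSON( "$gt" << 3 ) ) );
+           bool ok = m.matches( BSON( "a" << 5 ) );   // true
+    */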
+ class Matcher : boost::noncopyable {
+ int matchesDotted(
+ const char *fieldName,
+ const BSONElement& toMatch, const BSONObj& obj,
+ int compareOp, const ElementMatcher& bm, bool isArr , MatchDetails * details ) const;
+
+ /**
+ * Perform a NE or NIN match by returning the inverse of the opposite matching operation.
+ * Missing values are considered matches unless the match must not equal null.
+ */
+ int inverseMatch(
+ const char *fieldName,
+ const BSONElement &toMatch, const BSONObj &obj,
+ const ElementMatcher&bm, MatchDetails * details ) const;
+
+ public:
+ static int opDirection(int op) {
+ return op <= BSONObj::LTE ? -1 : 1;
+ }
+
+ Matcher(const BSONObj &pattern, bool nested=false);
+
+ ~Matcher();
+
+ bool matches(const BSONObj& j, MatchDetails * details = 0 ) const;
+
+ bool atomic() const { return _atomic; }
+
+ string toString() const {
+ return _jsobj.toString();
+ }
+
+ void addOrDedupConstraint( const shared_ptr< FieldRangeVector > &frv ) {
+ _orDedupConstraints.push_back( frv );
+ }
+
+ void popOrClause() {
+ _orMatchers.pop_front();
+ }
+
+ /**
+ * @return true if this key matcher will return the same true/false
+ * value as the provided doc matcher.
+ */
+ bool keyMatch( const Matcher &docMatcher ) const;
+
+ bool singleSimpleCriterion() const {
+ return false; // TODO SERVER-958
+// // TODO Really check, especially if all basics are ok.
+// // $all, etc
+// // _orConstraints?
+// return ( ( basics.size() + nRegex ) < 2 ) && !where && !_orMatchers.size() && !_norMatchers.size();
+ }
+
+        const BSONObj *getQuery() const { return &_jsobj; }
+
+ private:
+ /**
+ * Generate a matcher for the provided index key format using the
+ * provided full doc matcher.
+ */
+ Matcher( const Matcher &docMatcher, const BSONObj &constrainIndexKey );
+
+ void addBasic(const BSONElement &e, int c, bool isNot) {
+ // TODO May want to selectively ignore these element types based on op type.
+ if ( e.type() == MinKey || e.type() == MaxKey )
+ return;
+ _basics.push_back( ElementMatcher( e , c, isNot ) );
+ }
+
+ void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false);
+ bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags );
+
+ int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const;
+
+ bool parseClause( const BSONElement &e );
+ void parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers );
+
+ void parseWhere( const BSONElement &e );
+ void parseMatchExpressionElement( const BSONElement &e, bool nested );
+
+ Where *_where; // set if query uses $where
+ BSONObj _jsobj; // the query pattern. e.g., { name: "joe" }
+ BSONObj _constrainIndexKey;
+ vector<ElementMatcher> _basics;
+ bool _haveSize;
+ bool _all;
+ bool _hasArray;
+ bool _haveNeg;
+
+ /* $atomic - if true, a multi document operation (some removes, updates)
+ should be done atomically. in that case, we do not yield -
+ i.e. we stay locked the whole time.
+       http://www.mongodb.org/display/DOCS/Removing
+ */
+ bool _atomic;
+
+ vector<RegexMatcher> _regexs;
+
+ // so we delete the mem when we're done:
+ vector< shared_ptr< BSONObjBuilder > > _builders;
+ list< shared_ptr< Matcher > > _andMatchers;
+ list< shared_ptr< Matcher > > _orMatchers;
+ list< shared_ptr< Matcher > > _norMatchers;
+ vector< shared_ptr< FieldRangeVector > > _orDedupConstraints;
+
+ friend class CoveredIndexMatcher;
+ };
+
+ // If match succeeds on index key, then attempt to match full document.
+ class CoveredIndexMatcher : boost::noncopyable {
+ public:
+ CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
+ bool matches(const BSONObj &o) { return _docMatcher->matches( o ); }
+ bool matchesWithSingleKeyIndex(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ) {
+ return matches( key, recLoc, details, true );
+ }
+ /**
+ * This is the preferred method for matching against a cursor, as it
+ * can handle both multi and single key cursors.
+ */
+ bool matchesCurrent( Cursor * cursor , MatchDetails * details = 0 );
+ bool needRecord() { return _needRecord; }
+
+ Matcher& docMatcher() { return *_docMatcher; }
+
+ // once this is called, shouldn't use this matcher for matching any more
+ void advanceOrClause( const shared_ptr< FieldRangeVector > &frv ) {
+ _docMatcher->addOrDedupConstraint( frv );
+ // TODO this is not yet optimal. Since we could skip an entire
+ // or clause (if a match is impossible) between calls to advanceOrClause()
+ // we may not pop all the clauses we can.
+ _docMatcher->popOrClause();
+ }
+
+ CoveredIndexMatcher *nextClauseMatcher( const BSONObj &indexKeyPattern, bool alwaysUseRecord=false ) {
+ return new CoveredIndexMatcher( _docMatcher, indexKeyPattern, alwaysUseRecord );
+ }
+
+ string toString() const;
+
+ private:
+ bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true );
+ CoveredIndexMatcher(const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
+ void init( bool alwaysUseRecord );
+ shared_ptr< Matcher > _docMatcher;
+ Matcher _keyMatcher;
+
+ bool _needRecord; // if the key itself isn't good enough to determine a positive match
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/matcher_covered.cpp b/src/mongo/db/matcher_covered.cpp
new file mode 100644
index 00000000000..c6c89d03007
--- /dev/null
+++ b/src/mongo/db/matcher_covered.cpp
@@ -0,0 +1,101 @@
+// matcher_covered.cpp
+
+/* CoveredIndexMatcher matches against the index key first, loading the full document only when necessary */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "matcher.h"
+#include "../util/goodies.h"
+#include "../util/unittest.h"
+#include "diskloc.h"
+#include "../scripting/engine.h"
+#include "db.h"
+#include "client.h"
+
+#include "pdfile.h"
+
+namespace mongo {
+
+ CoveredIndexMatcher::CoveredIndexMatcher( const BSONObj &jsobj, const BSONObj &indexKeyPattern, bool alwaysUseRecord) :
+ _docMatcher( new Matcher( jsobj ) ),
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
+ init( alwaysUseRecord );
+ }
+
+ CoveredIndexMatcher::CoveredIndexMatcher( const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord ) :
+ _docMatcher( docMatcher ),
+ _keyMatcher( *_docMatcher, indexKeyPattern ) {
+ init( alwaysUseRecord );
+ }
+
+ void CoveredIndexMatcher::init( bool alwaysUseRecord ) {
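+        // _needRecord == false means the index key alone can decide a match --
+        // e.g. { a : 1 } against index { a : 1 } normally needs no fetch,
+        // while { a : { $size : 2 } } always loads the document.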
+ _needRecord =
+ alwaysUseRecord ||
+ !_keyMatcher.keyMatch( *_docMatcher );
+ }
+
+ bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ) {
+        // bool keyUsable = ! cursor->isMultiKey() && check for $orish like conditions in matcher SERVER-1264
+        return matches( cursor->currKey() , cursor->currLoc() , details ,
+                        !cursor->indexKeyPattern().isEmpty() // key unusable for an unindexed cursor...
+                        && !cursor->isMultiKey() // ...or a multikey cursor
+                        );
+ }
+
+ bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details , bool keyUsable ) {
+
+ LOG(5) << "CoveredIndexMatcher::matches() " << key.toString() << ' ' << recLoc.toString() << ' ' << keyUsable << endl;
+
+ dassert( key.isValid() );
+
+ if ( details )
+ details->reset();
+
+ if ( keyUsable ) {
+ if ( !_keyMatcher.matches(key, details ) ) {
+ return false;
+ }
+ if ( ! _needRecord ) {
+ return true;
+ }
+ }
+
+ if ( details )
+ details->_loadedObject = true;
+
+ bool res = _docMatcher->matches(recLoc.obj() , details );
+ LOG(5) << "CoveredIndexMatcher _docMatcher->matches() returns " << res << endl;
+ return res;
+ }
+
+ string CoveredIndexMatcher::toString() const {
+ StringBuilder buf;
+ buf << "(CoveredIndexMatcher ";
+
+ if ( _needRecord )
+ buf << "needRecord ";
+
+ buf << "keyMatcher: " << _keyMatcher.toString() << " ";
+
+ if ( _docMatcher )
+ buf << "docMatcher: " << _docMatcher->toString() << " ";
+
+ buf << ")";
+ return buf.str();
+ }
+}
diff --git a/src/mongo/db/minilex.h b/src/mongo/db/minilex.h
new file mode 100644
index 00000000000..677514aa47c
--- /dev/null
+++ b/src/mongo/db/minilex.h
@@ -0,0 +1,164 @@
+// minilex.h
+// mini js lexical analyzer. idea is to be dumb and fast.
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#error does anything use this?
+
+namespace mongo {
+
+#if defined(_WIN32)
+
+} // namespace mongo
+
+#include <hash_map>
+using namespace stdext;
+
+namespace mongo {
+
+ typedef const char * MyStr;
+ struct less_str {
+ bool operator()(const MyStr & x, const MyStr & y) const {
+ if ( strcmp(x, y) > 0)
+ return true;
+
+ return false;
+ }
+ };
+
+ typedef hash_map<const char*, int, hash_compare<const char *, less_str> > strhashmap;
+
+#else
+
+} // namespace mongo
+
+#include <ext/hash_map>
+
+namespace mongo {
+
+ using namespace __gnu_cxx;
+
+ typedef const char * MyStr;
+ struct eq_str {
+ bool operator()(const MyStr & x, const MyStr & y) const {
+ if ( strcmp(x, y) == 0)
+ return true;
+
+ return false;
+ }
+ };
+
+ typedef hash_map<const char*, int, hash<const char *>, eq_str > strhashmap;
+
+#endif
+
+ /*
+ struct MiniLexNotUsed {
+ strhashmap reserved;
+ bool ic[256]; // ic=Identifier Character
+ bool starter[256];
+
+    // dm: very dumb about comments and escaped quotes -- but we are faster that way, at least,
+    // albeit returning too much (which is ok for jsobj's current usage).
+    void grabVariables(char *code , strhashmap& vars) { // 'code' modified and must stay in scope
+ char *p = code;
+ char last = 0;
+ while ( *p ) {
+ if ( starter[*p] ) {
+ char *q = p+1;
+ while ( *q && ic[*q] ) q++;
+ const char *identifier = p;
+ bool done = *q == 0;
+ *q = 0;
+ if ( !reserved.count(identifier) ) {
+ // we try to be smart about 'obj' but have to be careful as obj.obj
+ // can happen; this is so that nFields is right for simplistic where cases
+ // so we can stop scanning in jsobj when we find the field of interest.
+ if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' )
+ ;
+ else
+ vars[identifier] = 1;
+ }
+ if ( done )
+ break;
+ p = q + 1;
+ continue;
+ }
+
+ if ( *p == '\'' ) {
+ p++;
+ while ( *p && *p != '\'' ) p++;
+ }
+ else if ( *p == '"' ) {
+ p++;
+ while ( *p && *p != '"' ) p++;
+ }
+ p++;
+ }
+}
+
+MiniLex() {
+ strhashmap atest;
+ atest["foo"] = 3;
+ assert( atest.count("bar") == 0 );
+ assert( atest.count("foo") == 1 );
+ assert( atest["foo"] == 3 );
+
+ for ( int i = 0; i < 256; i++ ) {
+ ic[i] = starter[i] = false;
+ }
+ for ( int i = 'a'; i <= 'z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = 'A'; i <= 'Z'; i++ )
+ ic[i] = starter[i] = true;
+ for ( int i = '0'; i <= '9'; i++ )
+ ic[i] = true;
+ for ( int i = 128; i < 256; i++ )
+ ic[i] = starter[i] = true;
+ ic['$'] = starter['$'] = true;
+ ic['_'] = starter['_'] = true;
+
+ reserved["break"] = true;
+ reserved["case"] = true;
+ reserved["catch"] = true;
+ reserved["continue"] = true;
+ reserved["default"] = true;
+ reserved["delete"] = true;
+ reserved["do"] = true;
+ reserved["else"] = true;
+ reserved["finally"] = true;
+ reserved["for"] = true;
+ reserved["function"] = true;
+ reserved["if"] = true;
+ reserved["in"] = true;
+ reserved["instanceof"] = true;
+ reserved["new"] = true;
+ reserved["return"] = true;
+ reserved["switch"] = true;
+ reserved["this"] = true;
+ reserved["throw"] = true;
+ reserved["try"] = true;
+ reserved["typeof"] = true;
+ reserved["var"] = true;
+ reserved["void"] = true;
+ reserved["while"] = true;
+ reserved["with "] = true;
+}
+};
+*/
+
+} // namespace mongo
diff --git a/src/mongo/db/module.cpp b/src/mongo/db/module.cpp
new file mode 100644
index 00000000000..4269c5e99a0
--- /dev/null
+++ b/src/mongo/db/module.cpp
@@ -0,0 +1,68 @@
+// module.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "module.h"
+
+namespace mongo {
+
+ std::list<Module*> * Module::_all;
+
+ Module::Module( const string& name )
+ : _name( name ) , _options( (string)"Module " + name + " options" ) {
+ if ( ! _all )
+ _all = new list<Module*>();
+ _all->push_back( this );
+ }
+
+ Module::~Module() {}
+
+ void Module::addOptions( boost::program_options::options_description& options ) {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ options.add( m->_options );
+ }
+ }
+
+ void Module::configAll( boost::program_options::variables_map& params ) {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->config( params );
+ }
+
+ }
+
+
+ void Module::initAll() {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->init();
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/module.h b/src/mongo/db/module.h
new file mode 100644
index 00000000000..71f276e0585
--- /dev/null
+++ b/src/mongo/db/module.h
@@ -0,0 +1,70 @@
+// module.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include <boost/program_options.hpp>
+#include <list>
+
+namespace mongo {
+
+ /**
+     * Module is the base class for adding modules to MongoDB.
+     * Modules allow adding hooks and features to mongo;
+     * the idea is to add hooks into the main code where module support is needed.
+     * Some examples: monitoring, indexes, full text search.
+ */
+ class Module {
+ public:
+ Module( const string& name );
+ virtual ~Module();
+
+ boost::program_options::options_description_easy_init add_options() {
+ return _options.add_options();
+ }
+
+ /**
+ * read config from command line
+ */
+ virtual void config( boost::program_options::variables_map& params ) = 0;
+
+ /**
+     * called after configuration when the server is ready to start
+ */
+ virtual void init() = 0;
+
+ /**
+ * called when the database is about to shutdown
+ */
+ virtual void shutdown() = 0;
+
+ const string& getName() { return _name; }
+
+ // --- static things
+
+ static void addOptions( boost::program_options::options_description& options );
+ static void configAll( boost::program_options::variables_map& params );
+ static void initAll();
+
+ private:
+ static std::list<Module*> * _all;
+ string _name;
+ boost::program_options::options_description _options;
+ };
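+
+    /* Minimal usage sketch (hypothetical "StatsModule", illustrative only):
+
+           class StatsModule : public Module {
+           public:
+               StatsModule() : Module( "stats" ) {
+                   add_options()
+                       ( "stats-interval" , boost::program_options::value<int>() , "reporting interval" );
+               }
+               virtual void config( boost::program_options::variables_map& params ) { }
+               virtual void init() { }
+               virtual void shutdown() { }
+           };
+           static StatsModule statsModule; // construction registers it via Module::_all
+    */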
+}
diff --git a/src/mongo/db/modules/mms.cpp b/src/mongo/db/modules/mms.cpp
new file mode 100644
index 00000000000..418a553f283
--- /dev/null
+++ b/src/mongo/db/modules/mms.cpp
@@ -0,0 +1,170 @@
+// @file mms.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../module.h"
+#include "../../util/net/httpclient.h"
+#include "../../util/background.h"
+#include "../commands.h"
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ /** Mongo Monitoring Service
+        if enabled, this runs in the background and pings MMS
+ */
+ class MMS : public BackgroundJob , Module {
+ public:
+
+ MMS()
+ : Module( "mms" ) , _baseurl( "" ) ,
+ _secsToSleep(1) , _token( "" ) , _name( "" ) {
+
+ add_options()
+ ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
+ ( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
+ ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
+ ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
+ ;
+ }
+
+ ~MMS() {}
+
+ void config( boost::program_options::variables_map& params ) {
+ _baseurl = params["mms-url"].as<string>();
+ if ( params.count( "mms-token" ) ) {
+ _token = params["mms-token"].as<string>();
+ }
+ if ( params.count( "mms-name" ) ) {
+ _name = params["mms-name"].as<string>();
+ }
+ _secsToSleep = params["mms-interval"].as<int>();
+ }
+
+ void run() {
+ if ( _token.size() == 0 && _name.size() == 0 ) {
+ log(1) << "mms not configured" << endl;
+ return;
+ }
+
+ if ( _token.size() == 0 ) {
+ log() << "no token for mms - not running" << endl;
+ return;
+ }
+
+ if ( _name.size() == 0 ) {
+ log() << "no name for mms - not running" << endl;
+ return;
+ }
+
+ log() << "mms monitor staring... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
+ Client::initThread( "mms" );
+ Client& c = cc();
+
+
+ // TODO: using direct client is bad, but easy for now
+
+ while ( ! inShutdown() ) {
+ sleepsecs( _secsToSleep );
+
+ try {
+ stringstream url;
+ url << _baseurl << "?"
+ << "token=" << _token << "&"
+ << "name=" << _name << "&"
+ << "ts=" << time(0)
+ ;
+
+ BSONObjBuilder bb;
+ // duplicated so the post has everything
+ bb.append( "token" , _token );
+ bb.append( "name" , _name );
+ bb.appendDate( "ts" , jsTime() );
+
+ // any commands
+ _add( bb , "buildinfo" );
+ _add( bb , "serverStatus" );
+
+ BSONObj postData = bb.obj();
+
+ log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;;
+
+                    HttpClient http;
+                    HttpClient::Result r;
+                    int rc = http.post( url.str() , postData.jsonString() , &r );
+ log(1) << "\t response code: " << rc << endl;
+ if ( rc != 200 ) {
+ log() << "mms error response code:" << rc << endl;
+ log(1) << "mms error body:" << r.getEntireResponse() << endl;
+ }
+ }
+ catch ( std::exception& e ) {
+ log() << "mms exception: " << e.what() << endl;
+ }
+ }
+
+ c.shutdown();
+ }
+
+ void _add( BSONObjBuilder& postData , const char* cmd ) {
+ Command * c = Command::findCommand( cmd );
+ if ( ! c ) {
+ log() << "MMS can't find command: " << cmd << endl;
+ postData.append( cmd , "can't find command" );
+ return;
+ }
+
+ if ( c->locktype() ) {
+ log() << "MMS can only use noLocking commands not: " << cmd << endl;
+ postData.append( cmd , "not noLocking" );
+ return;
+ }
+
+ BSONObj co = BSON( cmd << 1 );
+
+ string errmsg;
+ BSONObjBuilder sub;
+ if ( ! c->run( "admin.$cmd" , co , 0 , errmsg , sub , false ) )
+ postData.append( cmd , errmsg );
+ else
+ postData.append( cmd , sub.obj() );
+ }
+
+
+ void init() { go(); }
+
+ void shutdown() {
+ // TODO
+ }
+
+ private:
+ string _baseurl;
+ int _secsToSleep;
+
+ string _token;
+ string _name;
+
+ } /*mms*/ ;
+
+}
+
+
+
diff --git a/src/mongo/db/mongo.ico b/src/mongo/db/mongo.ico
new file mode 100755
index 00000000000..5258b6e0446
--- /dev/null
+++ b/src/mongo/db/mongo.ico
Binary files differ
diff --git a/src/mongo/db/mongommf.cpp b/src/mongo/db/mongommf.cpp
new file mode 100644
index 00000000000..af2e822404e
--- /dev/null
+++ b/src/mongo/db/mongommf.cpp
@@ -0,0 +1,339 @@
+// @file mongommf.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* this module adds some of our layers atop memory mapped files - specifically our handling of private views & such
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, not this.
+*/
+
+#include "pch.h"
+#include "cmdline.h"
+#include "mongommf.h"
+#include "dur.h"
+#include "dur_journalformat.h"
+#include "../util/mongoutils/str.h"
+#include "mongomutex.h"
+#include "d_globals.h"
+
+using namespace mongoutils;
+
+namespace mongo {
+
+#if defined(_WIN32)
+ extern mutex mapViewMutex;
+
+ __declspec(noinline) void makeChunkWritable(size_t chunkno) {
+ scoped_lock lk(mapViewMutex);
+
+ if( writable.get(chunkno) ) // double check lock
+ return;
+
+ // remap all maps in this chunk. common case is a single map, but could have more than one with smallfiles or .ns files
+ size_t chunkStart = chunkno * MemoryMappedFile::ChunkSize;
+ size_t chunkNext = chunkStart + MemoryMappedFile::ChunkSize;
+
+ scoped_lock lk2(privateViews._mutex());
+ map<void*,MongoMMF*>::iterator i = privateViews.finditer_inlock((void*) (chunkNext-1));
+ while( 1 ) {
+ const pair<void*,MongoMMF*> x = *(--i);
+ MongoMMF *mmf = x.second;
+ if( mmf == 0 )
+ break;
+
+ size_t viewStart = (size_t) x.first;
+ size_t viewEnd = (size_t) (viewStart + mmf->length());
+ if( viewEnd <= chunkStart )
+ break;
+
+ size_t protectStart = max(viewStart, chunkStart);
+ dassert(protectStart<chunkNext);
+
+ size_t protectEnd = min(viewEnd, chunkNext);
+ size_t protectSize = protectEnd - protectStart;
+ dassert(protectSize>0&&protectSize<=MemoryMappedFile::ChunkSize);
+
+ DWORD old;
+ bool ok = VirtualProtect((void*)protectStart, protectSize, PAGE_WRITECOPY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed (mcw) " << mmf->filename() << ' ' << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ }
+
+ writable.set(chunkno);
+ }
+
+ void* MemoryMappedFile::createPrivateMap() {
+ assert( maphandle );
+ scoped_lock lk(mapViewMutex);
+ void *p = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0);
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "createPrivateMap failed " << filename() << " " <<
+ errnoWithDescription(e) << " filelen:" << len <<
+ ((sizeof(void*) == 4 ) ? " (32 bit build)" : "") <<
+ endl;
+ }
+ else {
+ clearWritableBits(p);
+ views.push_back(p);
+ }
+ return p;
+ }
+
+ void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
+ d.dbMutex.assertWriteLocked(); // short window where we are unmapped so must be exclusive
+
+ // the mapViewMutex is to assure we get the same address on the remap
+ scoped_lock lk(mapViewMutex);
+
+ clearWritableBits(oldPrivateAddr);
+#if 1
+ // https://jira.mongodb.org/browse/SERVER-2942
+ DWORD old;
+ bool ok = VirtualProtect(oldPrivateAddr, (SIZE_T) len, PAGE_READONLY, &old);
+ if( !ok ) {
+ DWORD e = GetLastError();
+ log() << "VirtualProtect failed in remapPrivateView " << filename() << hex << oldPrivateAddr << ' ' << len << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+ return oldPrivateAddr;
+#else
+ if( !UnmapViewOfFile(oldPrivateAddr) ) {
+ DWORD e = GetLastError();
+ log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl;
+ assert(false);
+ }
+
+ // we want the new address to be the same as the old address in case things keep pointers around (as namespaceindex does).
+ void *p = MapViewOfFileEx(maphandle, FILE_MAP_READ, 0, 0,
+ /*dwNumberOfBytesToMap 0 means to eof*/0 /*len*/,
+ oldPrivateAddr);
+
+ if ( p == 0 ) {
+ DWORD e = GetLastError();
+ log() << "MapViewOfFileEx failed " << filename() << " " << errnoWithDescription(e) << endl;
+ assert(p);
+ }
+ assert(p == oldPrivateAddr);
+ return p;
+#endif
+ }
+#endif
+
+ void MongoMMF::remapThePrivateView() {
+ assert( cmdLine.dur );
+
+ // todo 1.9 : it turns out we require that we always remap to the same address.
+ // so the remove / add isn't necessary and can be removed
+ privateViews.remove(_view_private);
+ _view_private = remapPrivateView(_view_private);
+ privateViews.add(_view_private, this);
+ }
+
+ /** register view. threadsafe */
+ void PointerToMMF::add(void *view, MongoMMF *f) {
+ assert(view);
+ assert(f);
+ mutex::scoped_lock lk(_m);
+ _views.insert( pair<void*,MongoMMF*>(view,f) );
+ }
+
+ /** de-register view. threadsafe */
+ void PointerToMMF::remove(void *view) {
+ if( view ) {
+ mutex::scoped_lock lk(_m);
+ _views.erase(view);
+ }
+ }
+
+ PointerToMMF::PointerToMMF() : _m("PointerToMMF") {
+#if defined(SIZE_MAX)
+ size_t max = SIZE_MAX;
+#else
+ size_t max = ~((size_t)0);
+#endif
+ assert( max > (size_t) this ); // just checking that no one redef'd SIZE_MAX and that it is sane
+
+ // this way we don't need any boundary checking in _find()
+ _views.insert( pair<void*,MongoMMF*>((void*)0,(MongoMMF*)0) );
+ _views.insert( pair<void*,MongoMMF*>((void*)max,(MongoMMF*)0) );
+ }
+
+ /** underscore version of find is for when you are already locked
+ @param ofs out return our offset in the view
+ @return the MongoMMF to which this pointer belongs
+ */
+ MongoMMF* PointerToMMF::find_inlock(void *p, /*out*/ size_t& ofs) {
+ //
+ // .................memory..........................
+ // v1 p v2
+ // [--------------------] [-------]
+ //
+ // e.g., _find(p) == v1
+ //
+ const pair<void*,MongoMMF*> x = *(--_views.upper_bound(p));
+ MongoMMF *mmf = x.second;
+ if( mmf ) {
+ size_t o = ((char *)p) - ((char*)x.first);
+ if( o < mmf->length() ) {
+ ofs = o;
+ return mmf;
+ }
+ }
+ return 0;
+ }
+
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* PointerToMMF::find(void *p, /*out*/ size_t& ofs) {
+ mutex::scoped_lock lk(_m);
+ return find_inlock(p, ofs);
+ }
+
+ PointerToMMF privateViews;
+
+ /* void* MongoMMF::switchToPrivateView(void *readonly_ptr) {
+ assert( cmdLine.dur );
+ assert( testIntent );
+
+ void *p = readonly_ptr;
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = ourReadViews.find(p, ofs);
+ if( mmf ) {
+ void *res = ((char *)mmf->_view_private) + ofs;
+ return res;
+ }
+ }
+
+ {
+ size_t ofs=0;
+ MongoMMF *mmf = privateViews.find(p, ofs);
+ if( mmf ) {
+ log() << "dur: perf warning p=" << p << " is already in the writable view of " << mmf->filename() << endl;
+ return p;
+ }
+ }
+
+ // did you call writing() with a pointer that isn't into a datafile?
+ log() << "dur error switchToPrivateView " << p << endl;
+ return p;
+ }*/
+
+    /* switch to _view_write. normally this is a bad idea, since changes made there
+       will not show up in _view_private if that view has already diverged; hence the
+       leading underscore as a warning. useful when done with some care, though, such
+       as during initialization.
+    */
+ void* MongoMMF::_switchToWritableView(void *p) {
+ size_t ofs;
+ MongoMMF *f = privateViews.find(p, ofs);
+ assert( f );
+ return (((char *)f->_view_write)+ofs);
+ }
+
+ extern string dbpath;
+
+ // here so that it is precomputed...
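+    // e.g. "a/b/foo.3" splits into prefix "a/b/foo" and suffix "3" (_fileSuffixNo 3);
+    // a ".ns" suffix maps to the special dur::JEntry::DotNsSuffix value.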
+ void MongoMMF::setPath(string f) {
+ string suffix;
+ string prefix;
+ bool ok = str::rSplitOn(f, '.', prefix, suffix);
+ uassert(13520, str::stream() << "MongoMMF only supports filenames in a certain format " << f, ok);
+ if( suffix == "ns" )
+ _fileSuffixNo = dur::JEntry::DotNsSuffix;
+ else
+ _fileSuffixNo = (int) str::toUnsigned(suffix);
+
+ _p = RelativePath::fromFullPath(prefix);
+ }
+
+ bool MongoMMF::open(string fname, bool sequentialHint) {
+ LOG(3) << "mmf open " << fname << endl;
+ setPath(fname);
+ _view_write = mapWithOptions(fname.c_str(), sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::create(string fname, unsigned long long& len, bool sequentialHint) {
+ LOG(3) << "mmf create " << fname << endl;
+ setPath(fname);
+ _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0);
+ return finishOpening();
+ }
+
+ bool MongoMMF::finishOpening() {
+ LOG(3) << "mmf finishOpening " << (void*) _view_write << ' ' << filename() << " len:" << length() << endl;
+ if( _view_write ) {
+ if( cmdLine.dur ) {
+ _view_private = createPrivateMap();
+ if( _view_private == 0 ) {
+ msgasserted(13636, str::stream() << "file " << filename() << " open/create failed in createPrivateMap (look in log for more information)");
+ }
+ privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then...
+ }
+ else {
+ _view_private = _view_write;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ MongoMMF::MongoMMF() : _willNeedRemap(false) {
+ _view_write = _view_private = 0;
+ }
+
+ MongoMMF::~MongoMMF() {
+ try {
+ close();
+ }
+ catch(...) { error() << "exception in ~MongoMMF" << endl; }
+ }
+
+ namespace dur {
+ void closingFileNotification();
+ }
+
+ /*virtual*/ void MongoMMF::close() {
+ LOG(3) << "mmf close " << filename() << endl;
+
+ if( view_write() /*actually was opened*/ ) {
+ if( cmdLine.dur ) {
+ dur::closingFileNotification();
+ }
+ if( !d.dbMutex.isWriteLocked() ) {
+ assert( inShutdown() );
+ DEV {
+ log() << "is it really ok to close a mongommf outside a write lock? dbmutex status:" << d.dbMutex.getState() << " file:" << filename() << endl;
+ }
+ }
+ }
+
+ LockMongoFilesExclusive lk;
+ privateViews.remove(_view_private);
+ _view_write = _view_private = 0;
+ MemoryMappedFile::close();
+ }
+
+}
diff --git a/src/mongo/db/mongommf.h b/src/mongo/db/mongommf.h
new file mode 100644
index 00000000000..62a6cdfd3fd
--- /dev/null
+++ b/src/mongo/db/mongommf.h
@@ -0,0 +1,145 @@
+/** @file mongommf.h
+*
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/mmap.h"
+#include "../util/paths.h"
+
+namespace mongo {
+
+ /** MongoMMF adds some layers atop memory mapped files - specifically our handling of private views & such.
+ if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class,
+ not this.
+ */
+ class MongoMMF : private MemoryMappedFile {
+ protected:
+ virtual void* viewForFlushing() { return _view_write; }
+
+ public:
+ MongoMMF();
+ virtual ~MongoMMF();
+ virtual void close();
+
+ /** @return true if opened ok. */
+ bool open(string fname, bool sequentialHint /*typically we open with this false*/);
+
+ /** @return file length */
+ unsigned long long length() const { return MemoryMappedFile::length(); }
+
+ string filename() const { return MemoryMappedFile::filename(); }
+
+ void flush(bool sync) { MemoryMappedFile::flush(sync); }
+
+        /* Creates the file with the given length if it does not yet exist;
+           otherwise opens it and updates len to the existing file length.
+           @param sequentialHint if true the file will be sequentially accessed
+           @return true for ok
+        */
+ bool create(string fname, unsigned long long& len, bool sequentialHint);
+
+ /* Get the "standard" view (which is the private one).
+ @return the private view.
+ */
+ void* getView() const { return _view_private; }
+
+ /* Get the "write" view (which is required for writing).
+ @return the write view.
+ */
+ void* view_write() const { return _view_write; }
+
+
+        /* switch to _view_write. normally this is a bad idea, since changes made there
+           will not show up in _view_private if that view has already diverged; hence the
+           leading underscore as a warning. useful when done with some care, though, such
+           as during initialization.
+        */
+ static void* _switchToWritableView(void *private_ptr);
+
+        /** for a filename a/b/c.3
+            relativePath() is "a/b/c"
+            fileSuffixNo() is 3
+            if the suffix is "ns", fileSuffixNo() is -1
+            */
+ const RelativePath& relativePath() const {
+ DEV assert( !_p._p.empty() );
+ return _p;
+ }
+
+ int fileSuffixNo() const { return _fileSuffixNo; }
+
+ /** true if we have written.
+ set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration.
+ reset to false in REMAPPRIVATEVIEW
+ */
+ bool& willNeedRemap() { return _willNeedRemap; }
+
+ void remapThePrivateView();
+
+ virtual bool isMongoMMF() { return true; }
+
+ private:
+
+ void *_view_write;
+ void *_view_private;
+ bool _willNeedRemap;
+ RelativePath _p; // e.g. "somepath/dbname"
+ int _fileSuffixNo; // e.g. 3. -1="ns"
+
+ void setPath(string pathAndFileName);
+ bool finishOpening();
+ };
+
+ /** for durability support we want to be able to map pointers to specific MongoMMF objects.
+ */
+ class PointerToMMF : boost::noncopyable {
+ public:
+ PointerToMMF();
+
+ /** register view.
+ threadsafe
+ */
+ void add(void *view, MongoMMF *f);
+
+ /** de-register view.
+ threadsafe
+ */
+ void remove(void *view);
+
+ /** find associated MMF object for a given pointer.
+ threadsafe
+ @param ofs out returns offset into the view of the pointer, if found.
+ @return the MongoMMF to which this pointer belongs. null if not found.
+ */
+ MongoMMF* find(void *p, /*out*/ size_t& ofs);
+
+ /** for doing many finds in a row with one lock operation */
+ mutex& _mutex() { return _m; }
+ MongoMMF* find_inlock(void *p, /*out*/ size_t& ofs);
+
+ map<void*,MongoMMF*>::iterator finditer_inlock(void *p) { return _views.upper_bound(p); }
+
+ unsigned numberOfViews_inlock() const { return _views.size(); }
+
+ private:
+ mutex _m;
+ map<void*, MongoMMF*> _views;
+ };
+
+ // allows a pointer into any private view of a MongoMMF to be resolved to the MongoMMF object
+ extern PointerToMMF privateViews;
+}
diff --git a/src/mongo/db/mongomutex.h b/src/mongo/db/mongomutex.h
new file mode 100644
index 00000000000..08b091cae9c
--- /dev/null
+++ b/src/mongo/db/mongomutex.h
@@ -0,0 +1,388 @@
+// @file mongomutex.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Mutex hierarchy (1 = "leaf")
+ name level
+ Logstream::mutex 1
+ ClientCursor::ccmutex 2
+ dblock 3
+
+ End func name with _inlock to indicate "caller must lock before calling".
+*/
+
+#pragma once
+
+#include "../util/concurrency/rwlock.h"
+#include "../util/mmap.h"
+#include "../util/time_support.h"
+#include "d_globals.h"
+
+namespace mongo {
+
+ class Client;
+ Client* curopWaitingForLock( int type );
+ void curopGotLock(Client*);
+
+ /* mongomutex time stats */
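+    /* note only the outermost entered()/leaving() pair is timed; recursive
+       entries and exits just adjust the nesting count. */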
+ class MutexInfo {
+ unsigned long long enter, timeLocked; // microseconds
+ int locked;
+ unsigned long long start; // last as we touch this least often
+ public:
+ MutexInfo() : timeLocked(0) , locked(0) {
+ start = curTimeMicros64();
+ }
+ void entered() {
+ if ( locked == 0 )
+ enter = curTimeMicros64();
+ locked++;
+ assert( locked >= 1 );
+ }
+ void leaving() {
+ locked--;
+ assert( locked >= 0 );
+ if ( locked == 0 )
+ timeLocked += curTimeMicros64() - enter;
+ }
+ int isLocked() const { return locked; }
+ void getTimingInfo(unsigned long long &s, unsigned long long &tl) const {
+ s = start;
+ tl = timeLocked;
+ }
+ unsigned long long getTimeLocked() const { return timeLocked; }
+ };
+
+ /** the 'big lock'. a read/write lock.
+ there is one of these, d.dbMutex.
+
+ generally if you need to declare a mutex use the right primitive class, not this.
+
+ use readlock and writelock classes for scoped locks on this rather than direct
+ manipulation.
+ */
+ class MongoMutex {
+ public:
+ MongoMutex(const char * name);
+
+ /** @return
+ * > 0 write lock
+ * = 0 no lock
+ * < 0 read lock
+ */
+ int getState() const { return _state.get(); }
+
+ bool atLeastReadLocked() const { return _state.get() != 0; }
+ void assertAtLeastReadLocked() const { assert(atLeastReadLocked()); }
+ bool isWriteLocked/*by our thread*/() const { return getState() > 0; }
+ void assertWriteLocked() const {
+ assert( getState() > 0 );
+ DEV assert( !_releasedEarly.get() );
+ }
+
+ // write lock. use the writelock scoped lock class, not this directly.
+ void lock() {
+ if ( _writeLockedAlready() )
+ return;
+
+ _state.set(1);
+
+ curopWaitingForLock( 1 ); // stats
+ _m.lock();
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ // try write lock
+ bool lock_try( int millis ) {
+ if ( _writeLockedAlready() ) // adjusts _state
+ return true;
+
+ curopWaitingForLock( 1 );
+ bool got = _m.lock_try( millis );
+
+ if ( got ) {
+ _state.set(1);
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ return got;
+ }
+
+        // release the write lock
+ void unlock() {
+ int s = _state.get();
+ if( s > 1 ) {
+ _state.set(s-1); // recursive lock case
+ return;
+ }
+ if( s != 1 ) {
+ if( _releasedEarly.get() ) {
+ _releasedEarly.set(false);
+ return;
+ }
+ massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false);
+ }
+ _releasingWriteLock();
+ MongoFile::unmarkAllWritable(); // _DEBUG validation
+ _state.set(0);
+ _m.unlock();
+ }
+
+ /* unlock (write lock), and when unlock() is called later,
+ be smart then and don't unlock it again.
+ */
+ void releaseEarly() {
+ assert( getState() == 1 ); // must not be recursive
+ assert( !_releasedEarly.get() );
+ _releasedEarly.set(true);
+ unlock();
+ }
+
+ // read lock. don't call directly, use readlock.
+ void lock_shared() {
+ int s = _state.get();
+ if( s ) {
+ if( s > 0 ) {
+ // already in write lock - just be recursive and stay write locked
+ _state.set(s+1);
+ }
+ else {
+ // already in read lock - recurse
+ _state.set(s-1);
+ }
+ }
+ else {
+ _state.set(-1);
+ Client *c = curopWaitingForLock( -1 );
+ _m.lock_shared();
+ curopGotLock(c);
+ }
+ }
+
+ // try read lock
+ bool lock_shared_try( int millis ) {
+ int s = _state.get();
+ if ( s ) {
+ // we already have a lock, so no need to try
+ lock_shared();
+ return true;
+ }
+
+ /* [dm] should there be
+ Client *c = curopWaitingForLock( 1 );
+ here? i think so. seems to be missing.
+ */
+ bool got = _m.lock_shared_try( millis );
+ if ( got )
+ _state.set(-1);
+ return got;
+ }
+
+ void unlock_shared() {
+ int s = _state.get();
+ if( s > 0 ) {
+ wassert( s > 1 ); /* we must have done a lock write first to have s > 1 */
+ _state.set(s-1);
+ return;
+ }
+ if( s < -1 ) {
+ _state.set(s+1);
+ return;
+ }
+ wassert( s == -1 );
+ _state.set(0);
+ _m.unlock_shared();
+ }
+
+ MutexInfo& info() { return _minfo; }
+
+ private:
+ void lockedExclusively();
+ void unlockingExclusively();
+ void _acquiredWriteLock();
+ void _releasingWriteLock();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ bool _writeLockedAlready();
+
+ RWLock _m;
+
+ /* > 0 write lock with recurse count
+ < 0 read lock
+ */
+ ThreadLocalValue<int> _state;
+
+ MutexInfo _minfo;
+
+ public:
+ // indicates we need to call dur::REMAPPRIVATEVIEW on the next write lock
+ bool _remapPrivateViewRequested;
+
+ private:
+        /* See the releaseEarly() method.
+           we use a separate TLS value for _releasedEarly - that is ok because
+           on our normal/common code path we never even touch it */
+ ThreadLocalValue<bool> _releasedEarly;
+
+        /* this is for the fsyncAndLock command. otherwise the write lock's greediness will
+           make us block on any attempted write lock while the fsync holds its lock.
+        */
+ //volatile bool _blockWrites;
+ };
+
+ namespace dur {
+ void REMAPPRIVATEVIEW();
+ void releasingWriteLock(); // because it's hard to include dur.h here
+ }
+
+ inline void MongoMutex::_releasingWriteLock() {
+ dur::releasingWriteLock();
+ unlockingExclusively();
+ }
+
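+    // when the write lock is acquired, service any pending REMAPPRIVATEVIEW
+    // request from the durability system -- remapping must be done while
+    // write locked.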
+ inline void MongoMutex::_acquiredWriteLock() {
+ lockedExclusively();
+ if( _remapPrivateViewRequested ) {
+ dur::REMAPPRIVATEVIEW();
+ dassert( !_remapPrivateViewRequested );
+ }
+ }
+
+ string sayClientState();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ inline bool MongoMutex::_writeLockedAlready() {
+ int s = _state.get();
+ if( s > 0 ) {
+ _state.set(s+1);
+ return true;
+ }
+ massert( 10293 , string("internal error: locks are not upgradeable: ") + sayClientState() , s == 0 );
+ return false;
+ }
+
+ struct writelock {
+ writelock() { d.dbMutex.lock(); }
+ writelock(const string& ns) { d.dbMutex.lock(); }
+ ~writelock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock();
+ );
+ }
+ };
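+    // scoped usage, e.g.:
+    //     { writelock lk("db.coll"); /* ... writes ... */ } // lock released at scope exit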
+
+ struct readlock {
+ readlock(const string& ns) {
+ d.dbMutex.lock_shared();
+ }
+ readlock() { d.dbMutex.lock_shared(); }
+ ~readlock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock_shared();
+ );
+ }
+ };
+ struct readlocktry {
+ readlocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_shared_try( tryms );
+ }
+ ~readlocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock_shared();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct writelocktry {
+ writelocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_try( tryms );
+ }
+ ~writelocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct readlocktryassert : public readlocktry {
+ readlocktryassert(const string& ns, int tryms) :
+ readlocktry(ns,tryms) {
+ uassert(13142, "timeout getting readlock", got());
+ }
+ };
+
+    /** assure we have at least a read lock - the key point being that
+        if you already have a write lock, that's ok too.
+    */
+ struct atleastreadlock {
+ atleastreadlock( const string& ns = "" ) {
+ _prev = d.dbMutex.getState();
+ if ( _prev == 0 )
+ d.dbMutex.lock_shared();
+ }
+ ~atleastreadlock() {
+ if ( _prev == 0 )
+ d.dbMutex.unlock_shared();
+ }
+ private:
+ int _prev;
+ };
+
+ /* parameterized choice of read or write locking
+ use readlock and writelock instead of this when statically known which you want
+ */
+ class mongolock {
+ bool _writelock;
+ public:
+ mongolock(bool write) : _writelock(write) {
+ if( _writelock ) {
+ d.dbMutex.lock();
+ }
+ else
+ d.dbMutex.lock_shared();
+ }
+ ~mongolock() {
+ DESTRUCTOR_GUARD(
+ if( _writelock ) {
+ d.dbMutex.unlock();
+ }
+ else {
+ d.dbMutex.unlock_shared();
+ }
+ );
+ }
+ /* this unlocks, does NOT upgrade. that works for our current usage */
+ //void releaseAndWriteLock();
+ };
+
+ /* deprecated - use writelock and readlock instead */
+ struct dblock : public writelock {
+ dblock() : writelock("") { }
+ };
+
+ // eliminate this - we should just type "d.dbMutex.assertWriteLocked();" instead
+ inline void assertInWriteLock() { d.dbMutex.assertWriteLocked(); }
+
+}
diff --git a/src/mongo/db/namespace-inl.h b/src/mongo/db/namespace-inl.h
new file mode 100644
index 00000000000..a621a229546
--- /dev/null
+++ b/src/mongo/db/namespace-inl.h
@@ -0,0 +1,132 @@
+// @file namespace-inl.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "namespace.h"
+
+namespace mongo {
+
+ inline Namespace& Namespace::operator=(const char *ns) {
+ // we fill the remaining space with all zeroes here. as the full Namespace struct is in
+ // the datafiles (the .ns files specifically), that is helpful as then they are deterministic
+ // in the bytes they have for a given sequence of operations. that makes testing and debugging
+ // the data files easier.
+ //
+ // if profiling indicates this method is a significant bottleneck, we could have a version we
+ // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes.
+ //
+ unsigned len = strlen(ns);
+ uassert( 10080 , "ns name too long, max size is 128", len < MaxNsLen);
+ memset(buf, 0, MaxNsLen);
+ memcpy(buf, ns, len);
+ return *this;
+ }
+
+ inline string Namespace::extraName(int i) const {
+ char ex[] = "$extra";
+ ex[5] += i;
+ string s = string(buf) + ex;
+ massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen);
+ return s;
+ }
+
+ inline bool Namespace::isExtra() const {
+ const char *p = strstr(buf, "$extr");
+ return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example
+ }
+
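+    /* simple polynomial (base 131) hash of the name's bytes. the final
+       "| 0x8000000" sets bit 27, keeping the result strictly positive. */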
+ inline int Namespace::hash() const {
+ unsigned x = 0;
+ const char *p = buf;
+ while ( *p ) {
+ x = x * 131 + *p;
+ p++;
+ }
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+ }
+
+ /* future : this doesn't need to be an inline. */
+ inline string Namespace::getSisterNS( const char * local ) const {
+ assert( local && local[0] != '.' );
+ string old(buf);
+ if ( old.find( "." ) != string::npos )
+ old = old.substr( 0 , old.find( "." ) );
+ return old + "." + local;
+ }
+
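+    /* index slots 0..9 live in _indexes; slots 10..39 in the first chained
+       Extra block, 40..69 in the second (NIndexesMax caps the total at 64). */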
+ inline IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected ) {
+ if( idxNo < NIndexesBase ) {
+ IndexDetails& id = _indexes[idxNo];
+ return id;
+ }
+ Extra *e = extra();
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 13283 , "Missing Extra" );
+ massert(14045, "missing Extra", e);
+ }
+ int i = idxNo - NIndexesBase;
+ if( i >= NIndexesExtra ) {
+ e = e->next(this);
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 14823 , "missing extra" );
+ massert(14824, "missing Extra", e);
+ }
+ i -= NIndexesExtra;
+ }
+ return e->details[i];
+ }
+
+ inline int NamespaceDetails::idxNo(IndexDetails& idx) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( &i.next() == &idx )
+ return i.pos()-1;
+ }
+ massert( 10349 , "E12000 idxNo fails", false);
+ return -1;
+ }
+
+ inline int NamespaceDetails::findIndexByKeyPattern(const BSONObj& keyPattern) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( i.next().keyPattern() == keyPattern )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ // @return offset in indexes[]
+ inline int NamespaceDetails::findIndexByName(const char *name) {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ inline NamespaceDetails::IndexIterator::IndexIterator(NamespaceDetails *_d) {
+ d = _d;
+ i = 0;
+ n = d->nIndexes;
+ }
+
+}
diff --git a/src/mongo/db/namespace.cpp b/src/mongo/db/namespace.cpp
new file mode 100644
index 00000000000..af8b5694248
--- /dev/null
+++ b/src/mongo/db/namespace.cpp
@@ -0,0 +1,800 @@
+// namespace.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "mongommf.h"
+#include "../util/hashtab.h"
+#include "../scripting/engine.h"
+#include "btree.h"
+#include <algorithm>
+#include <list>
+#include "queryutil.h"
+#include "json.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
+
+ BSONObj idKeyPattern = fromjson("{\"_id\":1}");
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ int bucketSizes[] = {
+ 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000,
+ 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000,
+ 0x400000, 0x800000
+ };
+
+ NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool _capped ) {
+        /* be sure to initialize new fields here -- they do not default to zeroes the way we use this struct */
+ firstExtent = lastExtent = capExtent = loc;
+ stats.datasize = stats.nrecords = 0;
+ lastExtentSize = 0;
+ nIndexes = 0;
+ capped = _capped;
+ max = 0x7fffffff;
+ paddingFactor = 1.0;
+ flags = 0;
+ capFirstNewRecord = DiskLoc();
+ // Signal that we are on first allocation iteration through extents.
+ capFirstNewRecord.setInvalid();
+ // For capped case, signal that we are doing initial extent allocation.
+ if ( capped )
+ cappedLastDelRecLastExtent().setInvalid();
+ assert( sizeof(dataFileVersion) == 2 );
+ dataFileVersion = 0;
+ indexFileVersion = 0;
+ multiKeyIndexBits = 0;
+ reservedA = 0;
+ extraOffset = 0;
+ indexBuildInProgress = 0;
+ reservedB = 0;
+ capped2.cc2_ptr = 0;
+ capped2.fileNumber = 0;
+ memset(reserved, 0, sizeof(reserved));
+ }
+
+ bool NamespaceIndex::exists() const {
+ return !MMF::exists(path());
+ }
+
+ boost::filesystem::path NamespaceIndex::path() const {
+ boost::filesystem::path ret( dir_ );
+ if ( directoryperdb )
+ ret /= database_;
+ ret /= ( database_ + ".ns" );
+ return ret;
+ }
+
+ void NamespaceIndex::maybeMkdir() const {
+ if ( !directoryperdb )
+ return;
+ boost::filesystem::path dir( dir_ );
+ dir /= database_;
+ if ( !boost::filesystem::exists( dir ) )
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " );
+ }
+
+ unsigned lenForNewNsFiles = 16 * 1024 * 1024;
+
+#if defined(_DEBUG)
+ void NamespaceDetails::dump(const Namespace& k) {
+ if( !cmdLine.dur )
+ cout << "ns offsets which follow will not display correctly with --journal disabled" << endl;
+
+ size_t ofs = 1; // 1 is sentinel that the find call below failed
+ privateViews.find(this, /*out*/ofs);
+
+ cout << "ns" << hex << setw(8) << ofs << ' ';
+ cout << k.toString() << '\n';
+
+ if( k.isExtra() ) {
+ cout << "ns\t extra" << endl;
+ return;
+ }
+
+ cout << "ns " << firstExtent.toString() << ' ' << lastExtent.toString() << " nidx:" << nIndexes << '\n';
+ cout << "ns " << stats.datasize << ' ' << stats.nrecords << ' ' << nIndexes << '\n';
+ cout << "ns " << capped << ' ' << paddingFactor << ' ' << flags << ' ' << dataFileVersion << '\n';
+ cout << "ns " << multiKeyIndexBits << ' ' << indexBuildInProgress << '\n';
+ cout << "ns " << (int) reserved[0] << ' ' << (int) reserved[59];
+ cout << endl;
+ }
+#endif
+
+ void NamespaceDetails::onLoad(const Namespace& k) {
+
+ if( k.isExtra() ) {
+ /* overflow storage for indexes - so don't treat as a NamespaceDetails object. */
+ return;
+ }
+
+ if( indexBuildInProgress || capped2.cc2_ptr ) {
+ assertInWriteLock();
+ if( indexBuildInProgress ) {
+ log() << "indexBuildInProgress was " << indexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl;
+ getDur().writingInt( indexBuildInProgress ) = 0;
+ }
+ if( capped2.cc2_ptr )
+ *getDur().writing(&capped2.cc2_ptr) = 0;
+ }
+ }
+
+ static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) {
+ v.onLoad(k);
+ }
+
+ bool checkNsFilesOnLoad = true;
+
+ NOINLINE_DECL void NamespaceIndex::_init() {
+ assert( !ht );
+
+ d.dbMutex.assertWriteLocked();
+
+ /* if someone manually deleted the datafiles for a database,
+ we need to be sure to clear any cached info for the database in
+ local.*.
+ */
+ /*
+ if ( "local" != database_ ) {
+ DBInfo i(database_.c_str());
+ i.dbDropped();
+ }
+ */
+
+ unsigned long long len = 0;
+ boost::filesystem::path nsPath = path();
+ string pathString = nsPath.string();
+ void *p = 0;
+ if( MMF::exists(nsPath) ) {
+ if( f.open(pathString, true) ) {
+ len = f.length();
+ if ( len % (1024*1024) != 0 ) {
+ log() << "bad .ns file: " << pathString << endl;
+ uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 );
+ }
+ p = f.getView();
+ }
+ }
+ else {
+ // use lenForNewNsFiles, we are making a new database
+ massert( 10343, "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 );
+ maybeMkdir();
+ unsigned long long l = lenForNewNsFiles;
+ if( f.create(pathString, l, true) ) {
+ getDur().createdFile(pathString, l); // always a new file
+ len = l;
+ assert( len == lenForNewNsFiles );
+ p = f.getView();
+ }
+ }
+
+ if ( p == 0 ) {
+ /** TODO: this shouldn't terminate? */
+ log() << "error couldn't open file " << pathString << " terminating" << endl;
+ dbexit( EXIT_FS );
+ }
+
+
+ assert( len <= 0x7fffffff );
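+        // the .ns file is used directly as one big memory mapped hash table
+        // mapping Namespace -> NamespaceDetails.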
+ ht = new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index");
+ if( checkNsFilesOnLoad )
+ ht->iterAll(namespaceOnLoadCallback);
+ }
+
+ static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , void * extra ) {
+ list<string> * l = (list<string>*)extra;
+ if ( ! k.hasDollarSign() )
+ l->push_back( (string)k );
+ }
+ void NamespaceIndex::getNamespaces( list<string>& tofill , bool onlyCollections ) const {
+ assert( onlyCollections ); // TODO: need to implement this
+ // need boost::bind or something to make this less ugly
+
+ if ( ht )
+ ht->iterAll( namespaceGetNamespacesCallback , (void*)&tofill );
+ }
+
+ void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+
+ {
+ Record *r = (Record *) getDur().writingPtr(d, sizeof(Record));
+ d = &r->asDeleted();
+ // defensive code: try to make us notice if we reference a deleted record
+ (unsigned&) (r->data) = 0xeeeeeeee;
+ }
+ DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl;
+ if ( capped ) {
+ if ( !cappedLastDelRecLastExtent().isValid() ) {
+ // Initial extent allocation. Insert at end.
+ d->nextDeleted = DiskLoc();
+ if ( cappedListOfAllDeletedRecords().isNull() )
+ getDur().writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc;
+ else {
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted )
+ ;
+ i.drec()->nextDeleted.writing() = dloc;
+ }
+ }
+ else {
+ d->nextDeleted = cappedFirstDeletedInCurExtent();
+ getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc;
+ // always compact() after this so order doesn't matter
+ }
+ }
+ else {
+ int b = bucket(d->lengthWithHeaders);
+ DiskLoc& list = deletedList[b];
+ DiskLoc oldHead = list;
+ getDur().writingDiskLoc(list) = dloc;
+ d->nextDeleted = oldHead;
+ }
+ }
+
+    /* predetermine the location of the next alloc without actually doing it.
+       if it cannot be predetermined, returns null (so still call alloc() then)
+    */
+ DiskLoc NamespaceDetails::allocWillBeAt(const char *ns, int lenToAlloc) {
+ if ( !capped ) {
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ return __stdAlloc(lenToAlloc, true);
+ }
+ return DiskLoc();
+ }
+
+ /** allocate space for a new record from deleted lists.
+ @param lenToAlloc is WITH header
+ @param extentLoc OUT returns the extent location
+ @return null diskloc if no room - allocate a new extent then
+ */
+ DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) {
+ {
+ // align very slightly.
+ // note that if doing more coarse-grained quantization (really just if it isn't always
+ // a constant amount but if it varied by record size) then that quantization should
+ // NOT be done here but rather in __stdAlloc so that we can grab a deletedrecord that
+ // is just big enough if we happen to run into one.
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ }
+
+ DiskLoc loc = _alloc(ns, lenToAlloc);
+ if ( loc.isNull() )
+ return loc;
+
+ const DeletedRecord *r = loc.drec();
+ //r = getDur().writing(r);
+
+ /* note we want to grab from the front so our next pointers on disk tend
+ to go in a forward direction which is important for performance. */
+ int regionlen = r->lengthWithHeaders;
+ extentLoc.set(loc.a(), r->extentOfs);
+ assert( r->extentOfs < loc.getOfs() );
+
+ DEBUGGING out() << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl;
+
+ int left = regionlen - lenToAlloc;
+ if ( capped == 0 ) {
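+            // don't split off a remainder that is under 24 bytes or under 1/8
+            // of the requested length -- too small to be worth tracking as a
+            // separate DeletedRecord.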
+ if ( left < 24 || left < (lenToAlloc >> 3) ) {
+ // you get the whole thing.
+ return loc;
+ }
+ }
+
+ /* split off some for further use. */
+ getDur().writingInt(r->lengthWithHeaders) = lenToAlloc;
+ DiskLoc newDelLoc = loc;
+ newDelLoc.inc(lenToAlloc);
+ DeletedRecord *newDel = DataFileMgr::makeDeletedRecord(newDelLoc, left);
+ DeletedRecord *newDelW = getDur().writing(newDel);
+ newDelW->extentOfs = r->extentOfs;
+ newDelW->lengthWithHeaders = left;
+ newDelW->nextDeleted.Null();
+
+ addDeletedRec(newDel, newDelLoc);
+
+ return loc;
+ }
+
+ /* for non-capped collections.
+ @param peekOnly just look up where and don't reserve
+ returned item is out of the deleted list upon return
+ */
+ DiskLoc NamespaceDetails::__stdAlloc(int len, bool peekOnly) {
+ DiskLoc *prev;
+ DiskLoc *bestprev = 0;
+ DiskLoc bestmatch;
+ int bestmatchlen = 0x7fffffff;
+ int b = bucket(len);
+ DiskLoc cur = deletedList[b];
+ prev = &deletedList[b];
+ int extra = 5; // look for a better fit, a little.
+ int chain = 0;
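+        // best-fit scan: walk the deleted chain for this bucket (moving to a
+        // larger bucket if nothing fits), remembering the smallest record that
+        // is big enough; once a fit is found, look at up to 'extra' more
+        // entries for a tighter fit.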
+ while ( 1 ) {
+ {
+ int a = cur.a();
+ if ( a < -1 || a >= 100000 ) {
+ problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() <<
+ " a:" << a << " b:" << b << " chain:" << chain << '\n';
+ sayDbContext();
+ if ( cur == *prev )
+ prev->Null();
+ cur.Null();
+ }
+ }
+ if ( cur.isNull() ) {
+ // move to next bucket. if we were doing "extra", just break
+ if ( bestmatchlen < 0x7fffffff )
+ break;
+ b++;
+ if ( b > MaxBucket ) {
+ // out of space. alloc a new extent.
+ return DiskLoc();
+ }
+ cur = deletedList[b];
+ prev = &deletedList[b];
+ continue;
+ }
+ DeletedRecord *r = cur.drec();
+ if ( r->lengthWithHeaders >= len &&
+ r->lengthWithHeaders < bestmatchlen ) {
+ bestmatchlen = r->lengthWithHeaders;
+ bestmatch = cur;
+ bestprev = prev;
+ }
+ if ( bestmatchlen < 0x7fffffff && --extra <= 0 )
+ break;
+ if ( ++chain > 30 && b < MaxBucket ) {
+ // too slow, force move to next bucket to grab a big chunk
+ //b++;
+ chain = 0;
+ cur.Null();
+ }
+ else {
+ /*this defensive check only made sense for the mmap storage engine:
+ if ( r->nextDeleted.getOfs() == 0 ) {
+ problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() <<
+ " b:" << b << " chain:" << chain << ", fixing.\n";
+ r->nextDeleted.Null();
+ }*/
+ cur = r->nextDeleted;
+ prev = &r->nextDeleted;
+ }
+ }
+
+ /* unlink ourself from the deleted list */
+ if( !peekOnly ) {
+ const DeletedRecord *bmr = bestmatch.drec();
+ *getDur().writing(bestprev) = bmr->nextDeleted;
+ bmr->nextDeleted.writing().setInvalid(); // defensive.
+ assert(bmr->extentOfs < bestmatch.getOfs());
+ }
+
+ return bestmatch;
+ }
+
+ void NamespaceDetails::dumpDeleted(set<DiskLoc> *extents) {
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc dl = deletedList[i];
+ while ( !dl.isNull() ) {
+ DeletedRecord *r = dl.drec();
+ DiskLoc extLoc(dl.a(), r->extentOfs);
+ if ( extents == 0 || extents->count(extLoc) <= 0 ) {
+ out() << " bucket " << i << endl;
+ out() << " " << dl.toString() << " ext:" << extLoc.toString();
+ if ( extents && extents->count(extLoc) <= 0 )
+ out() << '?';
+ out() << " len:" << r->lengthWithHeaders << endl;
+ }
+ dl = r->nextDeleted;
+ }
+ }
+ }
+
+ DiskLoc NamespaceDetails::firstRecord( const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? firstExtent : startExtent;
+ !i.isNull(); i = i.ext()->xnext ) {
+ if ( !i.ext()->firstRecord.isNull() )
+ return i.ext()->firstRecord;
+ }
+ return DiskLoc();
+ }
+
+ DiskLoc NamespaceDetails::lastRecord( const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? lastExtent : startExtent;
+ !i.isNull(); i = i.ext()->xprev ) {
+ if ( !i.ext()->lastRecord.isNull() )
+ return i.ext()->lastRecord;
+ }
+ return DiskLoc();
+ }
+
+ int n_complaints_cap = 0;
+ void NamespaceDetails::maybeComplain( const char *ns, int len ) const {
+ if ( ++n_complaints_cap < 8 ) {
+ out() << "couldn't make room for new record (len: " << len << ") in capped ns " << ns << '\n';
+ int i = 0;
+ for ( DiskLoc e = firstExtent; !e.isNull(); e = e.ext()->xnext, ++i ) {
+ out() << " Extent " << i;
+ if ( e == capExtent )
+ out() << " (capExtent)";
+ out() << '\n';
+ out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.toString() << '\n';
+ out() << " fr: " << e.ext()->firstRecord.toString() <<
+ " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n';
+ }
+            assert( len * 5 > lastExtentSize ); // assume it is an unusually large record; if not, something is broken
+ }
+ }
+
+ /* alloc with capped table handling. */
+ DiskLoc NamespaceDetails::_alloc(const char *ns, int len) {
+ if ( !capped )
+ return __stdAlloc(len, false);
+
+ return cappedAlloc(ns,len);
+ }
+
+ void NamespaceIndex::kill_ns(const char *ns) {
+ d.dbMutex.assertWriteLocked();
+ if ( !ht )
+ return;
+ Namespace n(ns);
+ ht->kill(n);
+
+ for( int i = 0; i<=1; i++ ) {
+ try {
+ Namespace extra(n.extraName(i).c_str());
+ ht->kill(extra);
+ }
+ catch(DBException&) {
+ dlog(3) << "caught exception in kill_ns" << endl;
+ }
+ }
+ }
+
+ void NamespaceIndex::add_ns(const char *ns, DiskLoc& loc, bool capped) {
+ NamespaceDetails details( loc, capped );
+ add_ns( ns, details );
+ }
+ void NamespaceIndex::add_ns( const char *ns, const NamespaceDetails &details ) {
+ d.dbMutex.assertWriteLocked();
+ init();
+ Namespace n(ns);
+ uassert( 10081 , "too many namespaces/collections", ht->put(n, details));
+ }
+
+ /* extra space for indexes when more than 10 */
+ NamespaceDetails::Extra* NamespaceIndex::newExtra(const char *ns, int i, NamespaceDetails *d) {
+ mongo::d.dbMutex.assertWriteLocked();
+ assert( i >= 0 && i <= 1 );
+ Namespace n(ns);
+ Namespace extra(n.extraName(i).c_str()); // throws userexception if ns name too long
+
+ massert( 10350 , "allocExtra: base ns missing?", d );
+ massert( 10351 , "allocExtra: extra already exists", ht->get(extra) == 0 );
+
+ NamespaceDetails::Extra temp;
+ temp.init();
+ uassert( 10082 , "allocExtra: too many namespaces/collections", ht->put(extra, (NamespaceDetails&) temp));
+ NamespaceDetails::Extra *e = (NamespaceDetails::Extra *) ht->get(extra);
+ return e;
+ }
+ NamespaceDetails::Extra* NamespaceDetails::allocExtra(const char *ns, int nindexessofar) {
+ NamespaceIndex *ni = nsindex(ns);
+ int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
+ Extra *e = ni->newExtra(ns, i, this);
+ long ofs = e->ofsFrom(this);
+ if( i == 0 ) {
+ assert( extraOffset == 0 );
+ *getDur().writing(&extraOffset) = ofs;
+ assert( extra() == e );
+ }
+ else {
+ Extra *hd = extra();
+ assert( hd->next(this) == 0 );
+ hd->setNext(ofs);
+ }
+ return e;
+ }
+
+ /* you MUST call when adding an index. see pdfile.cpp */
+ IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) {
+ IndexDetails *id;
+ try {
+ id = &idx(nIndexes,true);
+ }
+ catch(DBException&) {
+ allocExtra(thisns, nIndexes);
+ id = &idx(nIndexes,false);
+ }
+
+ (*getDur().writing(&nIndexes))++;
+ if ( resetTransient )
+ NamespaceDetailsTransient::get(thisns).addedIndex();
+ return *id;
+ }
+
+ // must be called when renaming a NS to fix up extra
+ void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) {
+ extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
+ Extra *se = src->extra();
+ int n = NIndexesBase;
+ if( se ) {
+ Extra *e = allocExtra(thisns, n);
+ while( 1 ) {
+ n += NIndexesExtra;
+ e->copy(this, *se);
+ se = se->next(src);
+ if( se == 0 ) break;
+ Extra *nxt = allocExtra(thisns, n);
+ e->setNext( nxt->ofsFrom(this) );
+ e = nxt;
+ }
+ assert( extraOffset );
+ }
+ }
+
+ /* returns index of the first index in which the field is present. -1 if not present.
+ (aug08 - this method not currently used)
+ */
+ int NamespaceDetails::fieldIsIndexed(const char *fieldName) {
+ massert( 10346 , "not implemented", false);
+ /*
+ for ( int i = 0; i < nIndexes; i++ ) {
+ IndexDetails& idx = indexes[i];
+ BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 }
+ if ( !idxKey.getField(fieldName).eoo() )
+ return i;
+ }*/
+ return -1;
+ }
+
+ long long NamespaceDetails::storageSize( int * numExtents , BSONArrayBuilder * extentInfo ) const {
+ Extent * e = firstExtent.ext();
+ assert( e );
+
+ long long total = 0;
+ int n = 0;
+ while ( e ) {
+ total += e->length;
+ n++;
+
+ if ( extentInfo ) {
+                extentInfo->append( BSON( "len" << e->length << "loc" << e->myLoc.toBSONObj() ) );
+ }
+
+ e = e->getNextExtent();
+ }
+
+ if ( numExtents )
+ *numExtents = n;
+
+ return total;
+ }
+
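+    /* declare write intent (for the journal) on this NamespaceDetails and all of
+       its chained Extra blocks with a single call. */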
+ NamespaceDetails *NamespaceDetails::writingWithExtra() {
+ vector< pair< long long, unsigned > > writeRanges;
+ writeRanges.push_back( make_pair( 0, sizeof( NamespaceDetails ) ) );
+ for( Extra *e = extra(); e; e = e->next( this ) ) {
+ writeRanges.push_back( make_pair( (char*)e - (char*)this, sizeof( Extra ) ) );
+ }
+ return reinterpret_cast< NamespaceDetails* >( getDur().writingRangesAtOffsets( this, writeRanges ) );
+ }
+
+ /* ------------------------------------------------------------------------- */
+
+ SimpleMutex NamespaceDetailsTransient::_qcMutex("qc");
+ SimpleMutex NamespaceDetailsTransient::_isMutex("is");
+ map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_nsdMap;
+ typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter;
+
+ void NamespaceDetailsTransient::reset() {
+ DEV assertInWriteLock();
+ clearQueryCache();
+ _keysComputed = false;
+ _indexSpecs.clear();
+ }
+
+ /*static*/ NOINLINE_DECL NamespaceDetailsTransient& NamespaceDetailsTransient::make_inlock(const char *ns) {
+ shared_ptr< NamespaceDetailsTransient > &t = _nsdMap[ ns ];
+ assert( t.get() == 0 );
+ Database *database = cc().database();
+ assert( database );
+ if( _nsdMap.size() % 20000 == 10000 ) {
+ // so we notice if insanely large #s
+ log() << "opening namespace " << ns << endl;
+ log() << _nsdMap.size() << " namespaces in nsdMap" << endl;
+ }
+ t.reset( new NamespaceDetailsTransient(database, ns) );
+ return *t;
+ }
+
+ // note with repair there could be two databases with the same ns name.
+ // that is NOT handled here yet! TODO
+ // repair may not use nsdt though not sure. anyway, requires work.
+ NamespaceDetailsTransient::NamespaceDetailsTransient(Database *db, const char *ns) :
+ _ns(ns), _keysComputed(false), _qcWriteCount()
+ {
+ dassert(db);
+ }
+
+ NamespaceDetailsTransient::~NamespaceDetailsTransient() {
+ }
+
+ void NamespaceDetailsTransient::clearForPrefix(const char *prefix) {
+ assertInWriteLock();
+ vector< string > found;
+ for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i )
+ if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 )
+ found.push_back( i->first );
+ for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) {
+ _nsdMap[ *i ].reset();
+ }
+ }
+
+ void NamespaceDetailsTransient::eraseForPrefix(const char *prefix) {
+ assertInWriteLock();
+ vector< string > found;
+ for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i )
+ if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 )
+ found.push_back( i->first );
+ for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) {
+ _nsdMap.erase(*i);
+ }
+ }
+
+ void NamespaceDetailsTransient::computeIndexKeys() {
+ _keysComputed = true;
+ _indexKeys.clear();
+ NamespaceDetails *d = nsdetails(_ns.c_str());
+ if ( ! d )
+ return;
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() )
+ i.next().keyPattern().getFieldNames(_indexKeys);
+ }
+
+
+ /* ------------------------------------------------------------------------- */
+
+ /* add a new namespace to the system catalog (<dbname>.system.namespaces).
+ options: { capped : ..., size : ... }
+ */
+ void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0) {
+ LOG(1) << "New namespace: " << ns << endl;
+ if ( strstr(ns, "system.namespaces") ) {
+ // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
+ // TODO: fix above should not be strstr!
+ return;
+ }
+
+ {
+ BSONObjBuilder b;
+ b.append("name", ns);
+ if ( options )
+ b.append("options", *options);
+ BSONObj j = b.done();
+ char database[256];
+ nsToDatabase(ns, database);
+ string s = database;
+ if( cmdLine.configsvr && (s != "config" && s != "admin") ) {
+ uasserted(14037, "can't create user databases on a --configsvr instance");
+ }
+ s += ".system.namespaces";
+ theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true);
+ }
+ }
+
+ void renameNamespace( const char *from, const char *to ) {
+ NamespaceIndex *ni = nsindex( from );
+ assert( ni );
+ assert( ni->details( from ) );
+ assert( ! ni->details( to ) );
+
+ // Our namespace and index details will move to a different
+ // memory location. The only references to namespace and
+ // index details across commands are in cursors and nsd
+ // transient (including query cache) so clear these.
+ ClientCursor::invalidate( from );
+ NamespaceDetailsTransient::eraseForPrefix( from );
+
+ NamespaceDetails *details = ni->details( from );
+ ni->add_ns( to, *details );
+ NamespaceDetails *todetails = ni->details( to );
+ try {
+ todetails->copyingFrom(to, details); // fixes extraOffset
+ }
+ catch( DBException& ) {
+ // could end up here if .ns is full - if so try to clean up / roll back a little
+ ni->kill_ns(to);
+ throw;
+ }
+ ni->kill_ns( from );
+ details = todetails;
+
+ BSONObj oldSpec;
+ char database[MaxDatabaseNameLen];
+ nsToDatabase(from, database);
+ string s = database;
+ s += ".system.namespaces";
+ assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) );
+
+ BSONObjBuilder newSpecB;
+ BSONObjIterator i( oldSpec.getObjectField( "options" ) );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "create" ) != 0 )
+ newSpecB.append( e );
+ else
+ newSpecB << "create" << to;
+ }
+ BSONObj newSpec = newSpecB.done();
+ addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec );
+
+ deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true );
+ // oldSpec variable no longer valid memory
+
+ BSONObj oldIndexSpec;
+ s = database;
+ s += ".system.indexes";
+ while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) {
+ BSONObjBuilder newIndexSpecB;
+ BSONObjIterator i( oldIndexSpec );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "ns" ) != 0 )
+ newIndexSpecB.append( e );
+ else
+ newIndexSpecB << "ns" << to;
+ }
+ BSONObj newIndexSpec = newIndexSpecB.done();
+ DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, false );
+ int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) );
+ IndexDetails &indexDetails = details->idx(indexI);
+ string oldIndexNs = indexDetails.indexNamespace();
+ indexDetails.info = newIndexSpecLoc;
+ string newIndexNs = indexDetails.indexNamespace();
+
+ renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() );
+ deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true );
+ }
+ }
+
+ bool legalClientSystemNS( const string& ns , bool write ) {
+ if( ns == "local.system.replset" ) return true;
+
+ if ( ns.find( ".system.users" ) != string::npos )
+ return true;
+
+ if ( ns.find( ".system.js" ) != string::npos ) {
+ if ( write )
+ Scope::storedFuncMod();
+ return true;
+ }
+
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/namespace.h b/src/mongo/db/namespace.h
new file mode 100644
index 00000000000..9ceb6a6f4e9
--- /dev/null
+++ b/src/mongo/db/namespace.h
@@ -0,0 +1,629 @@
+// namespace.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "namespacestring.h"
+#include "jsobj.h"
+#include "querypattern.h"
+#include "diskloc.h"
+#include "../util/hashtab.h"
+#include "mongommf.h"
+#include "d_concurrency.h"
+
+namespace mongo {
+
+ class Database;
+
+#pragma pack(1)
+    /* This helper class is the key type of the HashTable in NamespaceIndex, e.g. see the line:
+       HashTable<Namespace,NamespaceDetails> *ht;
+    */
+ class Namespace {
+ public:
+ explicit Namespace(const char *ns) { *this = ns; }
+ Namespace& operator=(const char *ns);
+
+        bool hasDollarSign() const { return strchr( buf , '$' ) != 0; }
+ void kill() { buf[0] = 0x7f; }
+ bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
+ bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
+ int hash() const; // value returned is always > 0
+
+ size_t size() const { return strlen( buf ); }
+
+ string toString() const { return (string) buf; }
+ operator string() const { return (string) buf; }
+
+ /* NamespaceDetails::Extra was added after fact to allow chaining of data blocks to support more than 10 indexes
+ (more than 10 IndexDetails). It's a bit hacky because of this late addition with backward
+ file support. */
+ string extraName(int i) const;
+ bool isExtra() const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */
+
+ /** ( foo.bar ).getSisterNS( "blah" ) == foo.blah
+ perhaps this should move to the NamespaceString helper?
+ */
+ string getSisterNS( const char * local ) const;
+
+ enum MaxNsLenValue { MaxNsLen = 128 };
+ private:
+ char buf[MaxNsLen];
+ };
+#pragma pack()
+
+} // namespace mongo
+
+#include "index.h"
+
+namespace mongo {
+
+ /** @return true if a client can modify this namespace even though it is under ".system."
+ For example <dbname>.system.users is ok for regular clients to update.
+ @param write used when .system.js
+ */
+ bool legalClientSystemNS( const string& ns , bool write );
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleterecord about the right size.
+ */
+ const int Buckets = 19;
+ const int MaxBucket = 18;
+
+ extern int bucketSizes[];
+
+#pragma pack(1)
+ /* NamespaceDetails : this is the "header" for a collection that has all its details.
+ It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
+ */
+ class NamespaceDetails {
+ public:
+ enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
+
+ /*-------- data fields, as present on disk : */
+ DiskLoc firstExtent;
+ DiskLoc lastExtent;
+ /* NOTE: capped collections v1 override the meaning of deletedList.
+ deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
+ the capped namespace.
+ deletedList[1] points to the last record in the prev extent. When the "current extent"
+ changes, this value is updated. !deletedList[1].isValid() when this value is not
+ yet computed.
+ */
+ DiskLoc deletedList[Buckets];
+ // ofs 168 (8 byte aligned)
+ struct Stats {
+            // datasize and nrecords MUST be adjacent -- code assumes this!
+ long long datasize; // this includes padding, but not record headers
+ long long nrecords;
+ } stats;
+ int lastExtentSize;
+ int nIndexes;
+ private:
+ // ofs 192
+ IndexDetails _indexes[NIndexesBase];
+ public:
+ // ofs 352 (16 byte aligned)
+ int capped;
+ int max; // max # of objects for a capped table. TODO: should this be 64 bit?
+ double paddingFactor; // 1.0 = no padding.
+ // ofs 386 (16)
+ int flags;
+ DiskLoc capExtent;
+ DiskLoc capFirstNewRecord;
+ unsigned short dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
+ unsigned short indexFileVersion;
+ unsigned long long multiKeyIndexBits;
+ private:
+ // ofs 400 (16)
+ unsigned long long reservedA;
+ long long extraOffset; // where the $extra info is located (bytes relative to this)
+ public:
+ int indexBuildInProgress; // 1 if in prog
+ unsigned reservedB;
+ // ofs 424 (8)
+ struct Capped2 {
+ unsigned long long cc2_ptr; // see capped.cpp
+ unsigned fileNumber;
+ } capped2;
+ char reserved[60];
+ /*-------- end data 496 bytes */
+
+ explicit NamespaceDetails( const DiskLoc &loc, bool _capped );
+
+ class Extra {
+ long long _next;
+ public:
+ IndexDetails details[NIndexesExtra];
+ private:
+ unsigned reserved2;
+ unsigned reserved3;
+ Extra(const Extra&) { assert(false); }
+ Extra& operator=(const Extra& r) { assert(false); return *this; }
+ public:
+ Extra() { }
+ long ofsFrom(NamespaceDetails *d) {
+ return ((char *) this) - ((char *) d);
+ }
+ void init() { memset(this, 0, sizeof(Extra)); }
+ Extra* next(NamespaceDetails *d) {
+ if( _next == 0 ) return 0;
+ return (Extra*) (((char *) d) + _next);
+ }
+ void setNext(long ofs) { *getDur().writing(&_next) = ofs; }
+ void copy(NamespaceDetails *d, const Extra& e) {
+ memcpy(this, &e, sizeof(Extra));
+ _next = 0;
+ }
+ };
+ Extra* extra() {
+ if( extraOffset == 0 ) return 0;
+ return (Extra *) (((char *) this) + extraOffset);
+ }
+ /* add extra space for indexes when more than 10 */
+ Extra* allocExtra(const char *ns, int nindexessofar);
+ void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra
+
+ /* called when loaded from disk */
+ void onLoad(const Namespace& k);
+
+ /* dump info on this namespace. for debugging. */
+ void dump(const Namespace& k);
+
+ /* dump info on all extents for this namespace. for debugging. */
+ void dumpExtents();
+
+ private:
+ Extent *theCapExtent() const { return capExtent.ext(); }
+ void advanceCapExtent( const char *ns );
+ DiskLoc __capAlloc(int len);
+ DiskLoc cappedAlloc(const char *ns, int len);
+ DiskLoc &cappedFirstDeletedInCurExtent();
+ bool nextIsInCapExtent( const DiskLoc &dl ) const;
+
+ public:
+ DiskLoc& cappedListOfAllDeletedRecords() { return deletedList[0]; }
+ DiskLoc& cappedLastDelRecLastExtent() { return deletedList[1]; }
+ void cappedDumpDelInfo();
+ bool capLooped() const { return capped && capFirstNewRecord.isValid(); }
+ bool inCapExtent( const DiskLoc &dl ) const;
+ void cappedCheckMigrate();
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ */
+ void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive);
+ /** Remove all documents from the capped collection */
+ void emptyCappedCollection(const char *ns);
+
+ /* when a background index build is in progress, we don't count the index in nIndexes until
+ complete, yet need to still use it in _indexRecord() - thus we use this function for that.
+ */
+ int nIndexesBeingBuilt() const { return nIndexes + indexBuildInProgress; }
+
+ /* NOTE: be careful with flags. are we manipulating them in read locks? if so,
+ this isn't thread safe. TODO
+ */
+ enum NamespaceFlags {
+ Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called)
+ };
+
+ IndexDetails& idx(int idxNo, bool missingExpected = false );
+
+ /** get the IndexDetails for the index currently being built in the background. (there is at most one) */
+ IndexDetails& inProgIdx() {
+ DEV assert(indexBuildInProgress);
+ return idx(nIndexes);
+ }
+
+ class IndexIterator {
+ public:
+ int pos() { return i; } // note this is the next one to come
+ bool more() { return i < n; }
+ IndexDetails& next() { return d->idx(i++); }
+ private:
+ friend class NamespaceDetails;
+ int i, n;
+ NamespaceDetails *d;
+ IndexIterator(NamespaceDetails *_d);
+ };
+
+ IndexIterator ii() { return IndexIterator(this); }
+
+ /* hackish - find our index # in the indexes array */
+ int idxNo(IndexDetails& idx);
+
+ /* multikey indexes are indexes where there are more than one key in the index
+ for a single document. see multikey in wiki.
+ for these, we have to do some dedup work on queries.
+ */
+ bool isMultikey(int i) const { return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; }
+ void setIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( multiKeyIndexBits & x ) return;
+ *getDur().writing(&multiKeyIndexBits) |= x;
+ }
+ void clearIndexIsMultikey(int i) {
+ dassert( i < NIndexesMax );
+ unsigned long long x = ((unsigned long long) 1) << i;
+ if( (multiKeyIndexBits & x) == 0 ) return;
+ *getDur().writing(&multiKeyIndexBits) &= ~x;
+ }
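+        /* illustrative note (not part of the interface): the bit math above means
+           index #3 maps to mask 1ULL << 3 == 0x8, so after setIndexIsMultikey(3)
+           the journaled multiKeyIndexBits has that bit set and isMultikey(3) tests
+           exactly that bit. */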
+
+ /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails.
+ caller must populate returned object.
+ */
+ IndexDetails& addIndex(const char *thisns, bool resetTransient=true);
+
+ void aboutToDeleteAnIndex() {
+ *getDur().writing(&flags) = flags & ~Flag_HaveIdIndex;
+ }
+
+ /* returns index of the first index in which the field is present. -1 if not present. */
+ int fieldIsIndexed(const char *fieldName);
+
+        /* called to indicate that an update fit in place.
+           this is also called on an insert -- the idea is that if you had some mix and then went to
+           pure inserts it would adapt and PF would trend to 1.0. note update calls insert on a move,
+           so there is a double count there that must be adjusted for below.
+
+           todo: greater sophistication could be helpful and added later. for example the absolute
+           size of documents might be considered -- in some cases smaller ones may be more likely
+           to grow than larger ones in the same collection (though not always).
+        */
+ void paddingFits() {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+ double x = paddingFactor - 0.001;
+ if ( x >= 1.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
+ }
+ }
+ void paddingTooSmall() {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+            /* the more indexes we have, the higher the cost of a move. so we take that into
+               account herein. note on a move that insert() calls paddingFits(), thus
+               here for example with no inserts and nIndexes = 1 we have
+               .001*4-.001 or a 3:1 ratio to non-moves -> 75% non-moves. insert-heavy
+               workloads can push this down considerably. further tweaking will be a good
+               idea, but this should be an adequate starting point.
+            */
+ double N = min(nIndexes,7) + 3;
+ double x = paddingFactor + (0.001 * N);
+ if ( x <= 2.0 ) {
+ *getDur().writing(&paddingFactor) = x;
+ //getDur().setNoJournal(&paddingFactor, &x, sizeof(x));
+ }
+ }
+ }
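+        /* worked example for the two methods above (illustrative, assuming
+           nIndexes == 1 and ignoring the 1-in-4 sampling, which scales both sides
+           equally): a document move calls paddingTooSmall() and its insert() then
+           calls paddingFits(), a net of
+             +0.001 * (min(1,7) + 3) - 0.001 = +0.003
+           per move vs -0.001 per in-place update -- so about three in-place updates
+           balance one move, the ~75% non-move figure mentioned above. */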
+
+ // @return offset in indexes[]
+ int findIndexByName(const char *name);
+
+ // @return offset in indexes[]
+ int findIndexByKeyPattern(const BSONObj& keyPattern);
+
+ void findIndexByType( const string& name , vector<int>& matches ) {
+ IndexIterator i = ii();
+ while ( i.more() ) {
+ if ( i.next().getSpec().getTypeName() == name )
+ matches.push_back( i.pos() - 1 );
+ }
+ }
+
+        /* @return -1 = not found
+           generally _id is the first index, so this is not that expensive an operation (assuming it is present).
+        */
+ int findIdIndex() {
+ IndexIterator i = ii();
+ while( i.more() ) {
+ if( i.next().isIdIndex() )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ bool haveIdIndex() {
+ return (flags & NamespaceDetails::Flag_HaveIdIndex) || findIdIndex() >= 0;
+ }
+
+ /* return which "deleted bucket" for this size object */
+ static int bucket(int n) {
+ for ( int i = 0; i < Buckets; i++ )
+ if ( bucketSizes[i] > n )
+ return i;
+ return Buckets-1;
+ }
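+        /* e.g. (illustrative, assuming bucketSizes ascends 32, 64, 128, ...):
+           bucket(100) returns the index of the 128 bucket, the first whose size
+           exceeds 100; anything >= the largest bucket size falls into Buckets-1. */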
+
+        /* predetermine the location of the next alloc without actually doing it.
+           if it cannot be predetermined, returns null (so still call alloc() then).
+        */
+ DiskLoc allocWillBeAt(const char *ns, int lenToAlloc);
+
+ /* allocate a new record. lenToAlloc includes headers. */
+ DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc);
+
+ /* add a given record to the deleted chains for this NS */
+ void addDeletedRec(DeletedRecord *d, DiskLoc dloc);
+ void dumpDeleted(set<DiskLoc> *extents = 0);
+ // Start from firstExtent by default.
+ DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+ // Start from lastExtent by default.
+ DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const;
+ long long storageSize( int * numExtents = 0 , BSONArrayBuilder * extentInfo = 0 ) const;
+
+ int averageObjectSize() {
+ if ( stats.nrecords == 0 )
+ return 5;
+ return (int) (stats.datasize / stats.nrecords);
+ }
+
+ NamespaceDetails *writingWithoutExtra() {
+ return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) );
+ }
+ /** Make all linked Extra objects writeable as well */
+ NamespaceDetails *writingWithExtra();
+
+ private:
+ DiskLoc _alloc(const char *ns, int len);
+ void maybeComplain( const char *ns, int len ) const;
+ DiskLoc __stdAlloc(int len, bool willBeAt);
+ void compact(); // combine adjacent deleted records
+ friend class NamespaceIndex;
+ struct ExtraOld {
+ // note we could use this field for more chaining later, so don't waste it:
+ unsigned long long reserved1;
+ IndexDetails details[NIndexesExtra];
+ unsigned reserved2;
+ unsigned reserved3;
+ };
+ /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
+ void cappedTruncateLastDelUpdate();
+ BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
+ BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 );
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
+ }; // NamespaceDetails
+#pragma pack()
+
+ /* NamespaceDetailsTransient
+
+ these are things we know / compute about a namespace that are transient -- things
+ we don't actually store in the .ns file. so mainly caching of frequently used
+ information.
+
+ CAUTION: Are you maintaining this properly on a collection drop()? A dropdatabase()? Be careful.
+ The current field "allIndexKeys" may have too many keys in it on such an occurrence;
+ as currently used that does not cause anything terrible to happen.
+
+ todo: cleanup code, need abstractions and separation
+ */
+    // todo: multiple dbs with the same name (repairDatabase) are not handled herein. that may be
+    // the way to go, if not used by repair, but we need some sort of enforcement / asserts.
+ class NamespaceDetailsTransient : boost::noncopyable {
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
+
+ //Database *database;
+ const string _ns;
+ void reset();
+ static std::map< string, shared_ptr< NamespaceDetailsTransient > > _nsdMap;
+
+ NamespaceDetailsTransient(Database*,const char *ns);
+ public:
+ ~NamespaceDetailsTransient();
+ void addedIndex() { assertInWriteLock(); reset(); }
+ void deletedIndex() { assertInWriteLock(); reset(); }
+ /* Drop cached information on all namespaces beginning with the specified prefix.
+ Can be useful as index namespaces share the same start as the regular collection.
+ SLOW - sequential scan of all NamespaceDetailsTransient objects */
+ static void clearForPrefix(const char *prefix);
+ static void eraseForPrefix(const char *prefix);
+
+ /**
+ * @return a cursor interface to the query optimizer. The implementation may
+ * utilize a single query plan or interleave results from multiple query
+ * plans before settling on a single query plan. Note that the schema of
+ * currKey() documents, the matcher(), and the isMultiKey() nature of the
+ * cursor may change over the course of iteration.
+ *
+ * @param query - Query used to select indexes and populate matchers.
+ *
+ * @param order - Required ordering spec for documents produced by this cursor,
+ * empty object default indicates no order requirement. If no index exists that
+ * satisfies the required sort order, an empty shared_ptr is returned.
+ *
+ * @param requireIndex - If true, no unindexed (ie collection scan) cursors are
+ * used to generate the returned cursor. If an unindexed cursor is required, an
+ * assertion is raised by the cursor during iteration.
+ *
+ * @param simpleEqualityMatch - Set to true for certain simple queries -
+ * see queryoptimizer.cpp.
+ *
+ * The returned cursor may @throw inside of advance() or recoverFromYield() in
+ * certain error cases, for example if a capped overrun occurred during a yield.
+ * This indicates that the cursor was unable to perform a complete scan.
+ *
+ * This is a work in progress. Partial list of features not yet implemented:
+ * - covered indexes
+ * - in memory sorting
+ */
+ static shared_ptr<Cursor> getCursor( const char *ns, const BSONObj &query,
+ const BSONObj &order = BSONObj(), bool requireIndex = false,
+ bool *simpleEqualityMatch = 0 );
+
+ /* indexKeys() cache ---------------------------------------------------- */
+ /* assumed to be in write lock for this */
+ private:
+ bool _keysComputed;
+ set<string> _indexKeys;
+ void computeIndexKeys();
+ public:
+ /* get set of index keys for this namespace. handy to quickly check if a given
+ field is indexed (Note it might be a secondary component of a compound index.)
+ */
+ set<string>& indexKeys() {
+ DEV assertInWriteLock();
+ if ( !_keysComputed )
+ computeIndexKeys();
+ return _indexKeys;
+ }
+
+ /* IndexSpec caching */
+ private:
+ map<const IndexDetails*,IndexSpec> _indexSpecs;
+ static SimpleMutex _isMutex;
+ public:
+ const IndexSpec& getIndexSpec( const IndexDetails * details ) {
+ IndexSpec& spec = _indexSpecs[details];
+ if ( ! spec._finishedInit ) {
+ SimpleMutex::scoped_lock lk(_isMutex);
+ if ( ! spec._finishedInit ) {
+ spec.reset( details );
+ assert( spec._finishedInit );
+ }
+ }
+ return spec;
+ }
+
+ /* query cache (for query optimizer) ------------------------------------- */
+ private:
+ int _qcWriteCount;
+ map< QueryPattern, pair< BSONObj, long long > > _qcCache;
+ static NamespaceDetailsTransient& make_inlock(const char *ns);
+ public:
+ static SimpleMutex _qcMutex;
+
+        /* you must be in the qcMutex when calling this.
+           A NamespaceDetailsTransient object will not go out of scope on you if you are
+           d.dbMutex.atLeastReadLocked(), so you don't have to stay locked.
+           Creates a NamespaceDetailsTransient before returning if one does not exist.
+           todo: avoid creating too many on erroneous ns queries.
+        */
+ static NamespaceDetailsTransient& get_inlock(const char *ns);
+
+ static NamespaceDetailsTransient& get(const char *ns) {
+ SimpleMutex::scoped_lock lk(_qcMutex);
+ return get_inlock(ns);
+ }
+
+ void clearQueryCache() { // public for unit tests
+ _qcCache.clear();
+ _qcWriteCount = 0;
+ }
+ /* you must notify the cache if you are doing writes, as query plan optimality will change */
+ void notifyOfWriteOp() {
+ if ( _qcCache.empty() )
+ return;
+ if ( ++_qcWriteCount >= 100 )
+ clearQueryCache();
+ }
+ BSONObj indexForPattern( const QueryPattern &pattern ) {
+ return _qcCache[ pattern ].first;
+ }
+ long long nScannedForPattern( const QueryPattern &pattern ) {
+ return _qcCache[ pattern ].second;
+ }
+ void registerIndexForPattern( const QueryPattern &pattern, const BSONObj &indexKey, long long nScanned ) {
+ _qcCache[ pattern ] = make_pair( indexKey, nScanned );
+ }
+
+ }; /* NamespaceDetailsTransient */
+
+ inline NamespaceDetailsTransient& NamespaceDetailsTransient::get_inlock(const char *ns) {
+ std::map< string, shared_ptr< NamespaceDetailsTransient > >::iterator i = _nsdMap.find(ns);
+ if( i != _nsdMap.end() &&
+ i->second.get() ) { // could be null ptr from clearForPrefix
+ return *i->second;
+ }
+ return make_inlock(ns);
+ }
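+    /* usage sketch (illustrative): a write path would typically do
+           NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+       note get() takes _qcMutex itself; call get_inlock() only when already holding it. */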
+
+ /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
+ if you will: at least the core parts. (Additional info in system.* collections.)
+ */
+ class NamespaceIndex {
+ public:
+ NamespaceIndex(const string &dir, const string &database) :
+ ht( 0 ), dir_( dir ), database_( database ) {}
+
+ /* returns true if new db will be created if we init lazily */
+ bool exists() const;
+
+ void init() {
+ if( !ht )
+ _init();
+ }
+
+ void add_ns(const char *ns, DiskLoc& loc, bool capped);
+ void add_ns( const char *ns, const NamespaceDetails &details );
+
+ NamespaceDetails* details(const char *ns) {
+ if ( !ht )
+ return 0;
+ Namespace n(ns);
+ NamespaceDetails *d = ht->get(n);
+ if ( d && d->capped )
+ d->cappedCheckMigrate();
+ return d;
+ }
+
+ void kill_ns(const char *ns);
+
+ bool find(const char *ns, DiskLoc& loc) {
+ NamespaceDetails *l = details(ns);
+ if ( l ) {
+ loc = l->firstExtent;
+ return true;
+ }
+ return false;
+ }
+
+ bool allocated() const { return ht != 0; }
+
+ void getNamespaces( list<string>& tofill , bool onlyCollections = true ) const;
+
+ NamespaceDetails::Extra* newExtra(const char *ns, int n, NamespaceDetails *d);
+
+ boost::filesystem::path path() const;
+
+ unsigned long long fileLength() const { return f.length(); }
+
+ private:
+ void _init();
+ void maybeMkdir() const;
+
+ MongoMMF f;
+ HashTable<Namespace,NamespaceDetails> *ht;
+ string dir_;
+ string database_;
+ };
+
+ extern string dbpath; // --dbpath parm
+ extern bool directoryperdb;
+
+ // Rename a namespace within current 'client' db.
+ // (Arguments should include db name)
+ void renameNamespace( const char *from, const char *to );
+
+
+} // namespace mongo
diff --git a/src/mongo/db/namespacestring.h b/src/mongo/db/namespacestring.h
new file mode 100644
index 00000000000..d982c5fff75
--- /dev/null
+++ b/src/mongo/db/namespacestring.h
@@ -0,0 +1,147 @@
+// @file namespacestring.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+
+namespace mongo {
+
+ using std::string;
+
+ /* in the mongo source code, "client" means "database". */
+
+ const int MaxDatabaseNameLen = 256; // max str len for the db name, including null char
+
+ /* e.g.
+ NamespaceString ns("acme.orders");
+ cout << ns.coll; // "orders"
+ */
+ class NamespaceString {
+ public:
+ string db;
+ string coll; // note collection names can have periods in them for organizing purposes (e.g. "system.indexes")
+
+ NamespaceString( const char * ns ) { init(ns); }
+ NamespaceString( const string& ns ) { init(ns.c_str()); }
+
+ string ns() const { return db + '.' + coll; }
+
+ bool isSystem() const { return strncmp(coll.c_str(), "system.", 7) == 0; }
+ bool isCommand() const { return coll == "$cmd"; }
+
+ operator string() const { return ns(); }
+
+ bool operator==( const string& nsIn ) const { return nsIn == ns(); }
+ bool operator==( const char* nsIn ) const { return (string)nsIn == ns(); }
+ bool operator==( const NamespaceString& nsIn ) const { return nsIn.db == db && nsIn.coll == coll; }
+
+ bool operator!=( const string& nsIn ) const { return nsIn != ns(); }
+ bool operator!=( const char* nsIn ) const { return (string)nsIn != ns(); }
+ bool operator!=( const NamespaceString& nsIn ) const { return nsIn.db != db || nsIn.coll != coll; }
+
+ string toString() const { return ns(); }
+
+        /**
+         * @return true if ns is 'normal'. a '$' is used in names of collections holding index data, which do not contain BSON objects in their records.
+         * we special-case the local.oplog.$main ns -- naming it as such was a mistake.
+         */
+ static bool normal(const char* ns) {
+ const char *p = strchr(ns, '$');
+ if( p == 0 )
+ return true;
+ return strcmp( ns, "local.oplog.$main" ) == 0;
+ }
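+        /* e.g. (illustrative): normal("test.foo") == true,
+           normal("test.foo.$x_1") == false (index data),
+           normal("local.oplog.$main") == true (the special case). */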
+
+ static bool special(const char *ns) {
+ return !normal(ns) || strstr(ns, ".system.");
+ }
+
+ /**
+ * samples:
+ * good:
+ * foo
+ * bar
+ * foo-bar
+ * bad:
+ * foo bar
+ * foo.bar
+ * foo"bar
+ *
+ * @param db - a possible database name
+ * @return if db is an allowed database name
+ */
+ static bool validDBName( const string& db ) {
+ if ( db.size() == 0 || db.size() > 64 )
+ return false;
+ size_t good = strcspn( db.c_str() , "/\\. \"" );
+ return good == db.size();
+ }
+
+ /**
+ * samples:
+ * good:
+ * foo.bar
+ * bad:
+ * foo.
+ *
+ * @param dbcoll - a possible collection name of the form db.coll
+ * @return if db.coll is an allowed collection name
+ */
+        static bool validCollectionName(const char* dbcoll){
+            const char *c = strchr( dbcoll, '.' );
+            // guard the missing-'.' case: adding 1 to a null strchr() result is undefined
+            return c && c[1] && normal(dbcoll);
+        }
+
+ private:
+ void init(const char *ns) {
+ const char *p = strchr(ns, '.');
+ if( p == 0 ) return;
+ db = string(ns, p - ns);
+ coll = p + 1;
+ }
+ };
+
+ // "database.a.b.c" -> "database"
+ inline void nsToDatabase(const char *ns, char *database) {
+ const char *p = ns;
+ char *q = database;
+        while ( *p != '.' ) {
+            if ( *p == 0 )
+                break;
+            // check before writing so the copy can never actually overrun the buffer
+            if ( q - database >= MaxDatabaseNameLen - 1 ) {
+                log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl;
+                dbexit( EXIT_POSSIBLE_CORRUPTION );
+            }
+            *q++ = *p++;
+        }
+        *q = 0;
+ }
+ inline string nsToDatabase(const char *ns) {
+ char buf[MaxDatabaseNameLen];
+ nsToDatabase(ns, buf);
+ return buf;
+ }
+ inline string nsToDatabase(const string& ns) {
+ size_t i = ns.find( '.' );
+ if ( i == string::npos )
+ return ns;
+ return ns.substr( 0 , i );
+ }
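+    /* e.g. (illustrative): nsToDatabase("test.foo.bar") == "test" -- everything up
+       to the first '.' -- while nsToDatabase("test") == "test" (no '.' present). */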
+
+}
diff --git a/src/mongo/db/nonce.cpp b/src/mongo/db/nonce.cpp
new file mode 100644
index 00000000000..379e88f116d
--- /dev/null
+++ b/src/mongo/db/nonce.cpp
@@ -0,0 +1,95 @@
+// nonce.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "nonce.h"
+#include "../util/time_support.h"
+
+extern int do_md5_test(void);
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(nonce64) == 8 );
+
+ static Security security; // needs to be static so _initialized is preset to false (see initsafe below)
+
+ Security::Security() {
+ static int n;
+ massert( 10352 , "Security is a singleton class", ++n == 1);
+ init();
+ }
+
+ NOINLINE_DECL void Security::init() {
+ if( _initialized ) return;
+ _initialized = true;
+
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
+ _devrandom = new ifstream("/dev/urandom", ios::binary|ios::in);
+    massert( 10353 , "can't open /dev/urandom", _devrandom->is_open() );
+#elif defined(_WIN32)
+    srand(curTimeMicros()); // perhaps not relevant for rand_s but we might want it elsewhere anyway
+#else
+ srandomdev();
+#endif
+
+#ifndef NDEBUG
+ if ( do_md5_test() )
+ massert( 10354 , "md5 unit test fails", false);
+#endif
+ }
+
+ nonce64 Security::__getNonce() {
+ dassert( _initialized );
+ nonce64 n;
+#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__)
+ _devrandom->read((char*)&n, sizeof(n));
+ massert(10355 , "devrandom failed", !_devrandom->fail());
+#elif defined(_WIN32)
+ unsigned a=0, b=0;
+ assert( rand_s(&a) == 0 );
+ assert( rand_s(&b) == 0 );
+ n = (((unsigned long long)a)<<32) | b;
+#else
+ n = (((unsigned long long)random())<<32) | random();
+#endif
+ return n;
+ }
+
+ SimpleMutex nonceMutex("nonce");
+ nonce64 Security::_getNonce() {
+        // not good that this is a static, as gcc will mutex-protect it, which costs time
+ SimpleMutex::scoped_lock lk(nonceMutex);
+ if( !_initialized )
+ init();
+ return __getNonce();
+ }
+
+ nonce64 Security::getNonceDuringInit() {
+ // the mutex might not be inited yet. init phase should be one thread anyway (hopefully we don't spawn threads therein)
+ if( !security._initialized )
+ security.init();
+ return security.__getNonce();
+ }
+
+ nonce64 Security::getNonce() {
+ return security._getNonce();
+ }
+
+ // name warns us this might be a little slow (see code above)
+ unsigned goodRandomNumberSlow() { return (unsigned) Security::getNonce(); }
+
+} // namespace mongo
diff --git a/src/mongo/db/nonce.h b/src/mongo/db/nonce.h
new file mode 100644
index 00000000000..d6a147ab1c0
--- /dev/null
+++ b/src/mongo/db/nonce.h
@@ -0,0 +1,36 @@
+// @file nonce.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace mongo {
+
+ typedef unsigned long long nonce64;
+
+ struct Security {
+ Security();
+ static nonce64 getNonce();
+ static nonce64 getNonceDuringInit(); // use this version during global var constructors
+ private:
+ nonce64 _getNonce();
+ nonce64 __getNonce();
+ ifstream *_devrandom;
+ bool _initialized;
+ void init(); // can call more than once
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/oplog.cpp b/src/mongo/db/oplog.cpp
new file mode 100644
index 00000000000..342f362a28f
--- /dev/null
+++ b/src/mongo/db/oplog.cpp
@@ -0,0 +1,872 @@
+// @file oplog.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "oplog.h"
+#include "repl_block.h"
+#include "repl.h"
+#include "commands.h"
+#include "repl/rs.h"
+#include "stats/counters.h"
+#include "../util/file.h"
+#include "../util/unittest.h"
+#include "queryoptimizer.h"
+#include "ops/update.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt );
+
+ int __findingStartInitialTimeout = 5; // configurable for testing
+
+ // cached copies of these...so don't rename them, drop them, etc.!!!
+ static NamespaceDetails *localOplogMainDetails = 0;
+ static Database *localDB = 0;
+ static NamespaceDetails *rsOplogDetails = 0;
+ void oplogCheckCloseDatabase( Database * db ) {
+ localDB = 0;
+ localOplogMainDetails = 0;
+ rsOplogDetails = 0;
+ resetSlaveCache();
+ }
+
+ static void _logOpUninitialized(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ uassert(13288, "replSet error write op to db before replSet initialized", str::startsWith(ns, "local.") || *opstr == 'n');
+ }
+
+    /** write an op to the oplog that is already built.
+        todo : make _logOpRS() call this so we don't repeat ourselves?
+    */
+ void _logOpObjRS(const BSONObj& op) {
+ DEV assertInWriteLock();
+
+ const OpTime ts = op["ts"]._opTime();
+ long long h = op["h"].numberLong();
+
+ {
+ const char *logns = rsoplog;
+ if ( rsOplogDetails == 0 ) {
+ Client::Context ctx( logns , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ rsOplogDetails = nsdetails(logns);
+ massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
+ }
+ Client::Context ctx( logns , localDB, false );
+ {
+ int len = op.objsize();
+ Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
+ memcpy(getDur().writingPtr(r->data, len), op.objdata(), len);
+ }
+        /* todo: now() has code to handle clock skew. but if the server-to-server skew is large it will get unhappy.
+            this code (or the code in now() maybe) should be improved.
+        */
+ if( theReplSet ) {
+ if( !(theReplSet->lastOpTimeWritten<ts) ) {
+                log() << "replSet error possible failover clock skew issue? " << theReplSet->lastOpTimeWritten.toString() << ' ' << ts.toString() << endl;
+ }
+ theReplSet->lastOpTimeWritten = ts;
+ theReplSet->lastH = h;
+ ctx.getClient()->setLastOp( ts );
+ }
+ }
+ }
+
+    /** given a BSON object, create a new one at dst which is the existing (partial) object
+        with a new object element appended at the end with fieldname "o".
+
+        @param partial already built object with everything except the o member. e.g. something like:
+           { ts:..., ns:..., os2:... }
+        @param o a bson object to be added with fieldname "o"
+        @param dst where to put the newly built combined object. e.g. ends up as something like:
+           { ts:..., ns:..., os2:..., o:... }
+    */
+ void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) {
+ const int size1 = partial.objsize() - 1; // less the EOO char
+ const int oOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0
+
+ void *p = getDur().writingPtr(dst, oOfs+o.objsize()+1);
+
+ memcpy(p, partial.objdata(), size1);
+
+ // adjust overall bson object size for the o: field
+ *(static_cast<unsigned*>(p)) += o.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;
+
+ char *b = static_cast<char *>(p);
+ b += size1;
+ *b++ = (char) Object;
+ *b++ = 'o'; // { o : ... }
+ *b++ = 0; // null terminate "o" fieldname
+ memcpy(b, o.objdata(), o.objsize());
+ b += o.objsize();
+ *b = EOO;
+ }
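+    /* byte-level sketch of the above (illustrative): for partial of size S,
+       dst[0..S-2] holds partial minus its trailing EOO; then come the three added
+       bytes -- type (Object), 'o', '\0' -- then o.objdata(), then a final EOO.
+       the int32 length prefix at dst[0..3] is bumped by o.objsize() + 3 to match. */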
+
+ // global is safe as we are in write lock. we put the static outside the function to avoid the implicit mutex
+ // the compiler would use if inside the function. the reason this is static is to avoid a malloc/free for this
+ // on every logop call.
+ static BufBuilder logopbufbuilder(8*1024);
+ static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ DEV assertInWriteLock();
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
+ if ( strncmp(ns, "local.slaves", 12) == 0 )
+ resetSlaveCache();
+ return;
+ }
+
+ const OpTime ts = OpTime::now();
+ long long hashNew;
+ if( theReplSet ) {
+ massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
+ hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
+ }
+ else {
+ // must be initiation
+ assert( *ns == 0 );
+ hashNew = 0;
+ }
+
+ /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
+ instead we do a single copy to the destination position in the memory mapped file.
+ */
+
+ logopbufbuilder.reset();
+ BSONObjBuilder b(logopbufbuilder);
+ b.appendTimestamp("ts", ts.asDate());
+ b.append("h", hashNew);
+ b.append("op", opstr);
+ b.append("ns", ns);
+ if ( bb )
+ b.appendBool("b", *bb);
+ if ( o2 )
+ b.append("o2", *o2);
+ BSONObj partial = b.done();
+ int posz = partial.objsize();
+ int len = posz + obj.objsize() + 1 + 2 /*o:*/;
+
+ Record *r;
+ DEV assert( logNS == 0 );
+ {
+ const char *logns = rsoplog;
+ if ( rsOplogDetails == 0 ) {
+ Client::Context ctx( logns , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ rsOplogDetails = nsdetails(logns);
+ massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
+ }
+ Client::Context ctx( logns , localDB, false );
+ r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
+        /* todo: now() has code to handle clock skew. but if the server-to-server skew is large it will get unhappy.
+            this code (or the code in now() maybe) should be improved.
+        */
+ if( theReplSet ) {
+ if( !(theReplSet->lastOpTimeWritten<ts) ) {
+ log() << "replSet ERROR possible failover clock skew issue? " << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog;
+ log() << "replSet " << theReplSet->isPrimary() << rsLog;
+ }
+ theReplSet->lastOpTimeWritten = ts;
+ theReplSet->lastH = hashNew;
+ ctx.getClient()->setLastOp( ts );
+ }
+ }
+
+ append_O_Obj(r->data, partial, obj);
+
+ if ( logLevel >= 6 ) {
+ BSONObj temp(r);
+ log( 6 ) << "logOp:" << temp << endl;
+ }
+ }
+
+    /* we write to local.oplog.$main:
+ { ts : ..., op: ..., ns: ..., o: ... }
+ ts: an OpTime timestamp
+ op:
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "db" declares presence of a database (ns is set to the db name + '.')
+ "n" no op
+ logNS - where to log it. 0/null means "local.oplog.$main".
+ bb:
+ if not null, specifies a boolean to pass along to the other side as b: param.
+ used for "justOne" or "upsert" flags on 'd', 'u'
+ first: true
+ when set, indicates this is the first thing we have logged for this database.
+ thus, the slave does not need to copy down all the data when it sees this.
+
+ note this is used for single collection logging even when --replSet is enabled.
+ */
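+    /* example entry as laid out by the code below (illustrative values):
+         { ts: Timestamp(...), op: "u", ns: "test.foo", b: true,
+           o2: { _id: ... }, o: { $set: { x: 1 } } }
+    */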
+ static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) {
+ DEV assertInWriteLock();
+ static BufBuilder bufbuilder(8*1024);
+
+ if ( strncmp(ns, "local.", 6) == 0 ) {
+ if ( strncmp(ns, "local.slaves", 12) == 0 ) {
+ resetSlaveCache();
+ }
+ return;
+ }
+
+ const OpTime ts = OpTime::now();
+ Client::Context context("",0,false);
+
+ /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
+ instead we do a single copy to the destination position in the memory mapped file.
+ */
+
+ bufbuilder.reset();
+ BSONObjBuilder b(bufbuilder);
+ b.appendTimestamp("ts", ts.asDate());
+ b.append("op", opstr);
+ b.append("ns", ns);
+ if ( bb )
+ b.appendBool("b", *bb);
+ if ( o2 )
+ b.append("o2", *o2);
+ BSONObj partial = b.done(); // partial is everything except the o:... part.
+
+ int po_sz = partial.objsize();
+ int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;
+
+ Record *r;
+ if( logNS == 0 ) {
+ logNS = "local.oplog.$main";
+ if ( localOplogMainDetails == 0 ) {
+ Client::Context ctx( logNS , dbpath, false);
+ localDB = ctx.db();
+ assert( localDB );
+ localOplogMainDetails = nsdetails(logNS);
+ assert( localOplogMainDetails );
+ }
+ Client::Context ctx( logNS , localDB, false );
+ r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
+ }
+ else {
+ Client::Context ctx( logNS, dbpath, false );
+ assert( nsdetails( logNS ) );
+ // first we allocate the space, then we fill it below.
+ r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
+ }
+
+ append_O_Obj(r->data, partial, obj);
+
+ context.getClient()->setLastOp( ts );
+
+ if ( logLevel >= 6 ) {
+ BSONObj temp(r);
+ log( 6 ) << "logging op:" << temp << endl;
+ }
+
+ }
+
+ static void (*_logOp)(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) = _logOpOld;
+ void newReplUp() {
+ replSettings.master = true;
+ _logOp = _logOpRS;
+ }
+ void newRepl() {
+ replSettings.master = true;
+ _logOp = _logOpUninitialized;
+ }
+ void oldRepl() { _logOp = _logOpOld; }
+
+ void logKeepalive() {
+ _logOp("n", "", 0, BSONObj(), 0, 0);
+ }
+ void logOpComment(const BSONObj& obj) {
+ _logOp("n", "", 0, obj, 0, 0);
+ }
+ void logOpInitiate(const BSONObj& obj) {
+ _logOpRS("n", "", 0, obj, 0, 0);
+ }
+
+    /** @param opstr:
+ c userCreateNS
+ i insert
+ n no-op / keepalive
+ d delete / remove
+ u update
+ */
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) {
+ if ( replSettings.master ) {
+ _logOp(opstr, ns, 0, obj, patt, b);
+ }
+
+ logOpForSharding( opstr , ns , obj , patt );
+ }
+
+ void createOplog() {
+ dblock lk;
+
+ const char * ns = "local.oplog.$main";
+
+ bool rs = !cmdLine._replSet.empty();
+ if( rs )
+ ns = rsoplog;
+
+ Client::Context ctx(ns);
+
+ NamespaceDetails * nsd = nsdetails( ns );
+
+ if ( nsd ) {
+
+ if ( cmdLine.oplogSize != 0 ) {
+ int o = (int)(nsd->storageSize() / ( 1024 * 1024 ) );
+ int n = (int)(cmdLine.oplogSize / ( 1024 * 1024 ) );
+ if ( n != o ) {
+ stringstream ss;
+ ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog";
+ log() << ss.str() << endl;
+ throw UserException( 13257 , ss.str() );
+ }
+ }
+
+ if( rs ) return;
+
+ DBDirectClient c;
+ BSONObj lastOp = c.findOne( ns, Query().sort(reverseNaturalObj) );
+ if ( !lastOp.isEmpty() ) {
+ OpTime::setLast( lastOp[ "ts" ].date() );
+ }
+ return;
+ }
+
+ /* create an oplog collection, if it doesn't yet exist. */
+ BSONObjBuilder b;
+ double sz;
+ if ( cmdLine.oplogSize != 0 )
+ sz = (double)cmdLine.oplogSize;
+ else {
+ /* not specified. pick a default size */
+ sz = 50.0 * 1000 * 1000;
+ if ( sizeof(int *) >= 8 ) {
+#if defined(__APPLE__)
+ // typically these are desktops (dev machines), so keep it smallish
+ sz = (256-64) * 1000 * 1000;
+#else
+ sz = 990.0 * 1000 * 1000;
+ boost::intmax_t free = File::freeSpace(dbpath); //-1 if call not supported.
+ double fivePct = free * 0.05;
+ if ( fivePct > sz )
+ sz = fivePct;
+#endif
+ }
+ }
+
+ log() << "******" << endl;
+ log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl;
+
+ b.append("size", sz);
+ b.appendBool("capped", 1);
+ b.appendBool("autoIndexId", false);
+
+ string err;
+ BSONObj o = b.done();
+ userCreateNS(ns, o, err, false);
+ if( !rs )
+ logOp( "n", "", BSONObj() );
+
+ /* sync here so we don't get any surprising lag later when we try to sync */
+ MemoryMappedFile::flushAll(true);
+ log() << "******" << endl;
+ }
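+    /* sizing sketch for the defaults above (illustrative): on a 64-bit non-OS X
+       host with 100GB free under dbpath, five percent is 5GB > 990MB, so the
+       default oplog would be ~5GB; with only 10GB free, the 990MB floor wins. */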
+
+ // -------------------------------------
+
+ FindingStartCursor::FindingStartCursor( const QueryPlan & qp ) :
+ _qp( qp ),
+ _findingStart( true ),
+ _findingStartMode()
+ { init(); }
+
+ void FindingStartCursor::next() {
+ if ( !_findingStartCursor || !_findingStartCursor->ok() ) {
+ _findingStart = false;
+ _c = _qp.newCursor(); // on error, start from beginning
+ destroyClientCursor();
+ return;
+ }
+ switch( _findingStartMode ) {
+ // Initial mode: scan backwards from end of collection
+ case Initial: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record out of query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ RARELY {
+ if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) {
+ // If we've scanned enough, switch to find extent mode.
+ createClientCursor( extentFirstLoc( _findingStartCursor->currLoc() ) );
+ _findingStartMode = FindExtent;
+ return;
+ }
+ }
+ return;
+ }
+ // FindExtent mode: moving backwards through extents, check first
+ // document of each extent.
+ case FindExtent: {
+ if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStartMode = InExtent;
+ return;
+ }
+ DiskLoc prev = prevExtentFirstLoc( _findingStartCursor->currLoc() );
+ if ( prev.isNull() ) { // hit beginning, so start scanning from here
+ createClientCursor();
+ _findingStartMode = InExtent;
+ return;
+ }
+            // There might be a more efficient implementation than creating a new cursor & client
+            // cursor each time; not worrying about that for now.
+ createClientCursor( prev );
+ return;
+ }
+ // InExtent mode: once an extent is chosen, find starting doc in the extent.
+ case InExtent: {
+ if ( _matcher->matchesCurrent( _findingStartCursor->c() ) ) {
+ _findingStart = false; // found first record in query range, so scan normally
+ _c = _qp.newCursor( _findingStartCursor->currLoc() );
+ destroyClientCursor();
+ return;
+ }
+ _findingStartCursor->advance();
+ return;
+ }
+ default: {
+ massert( 14038, "invalid _findingStartMode", false );
+ }
+ }
+ }
+
+ DiskLoc FindingStartCursor::extentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( !_qp.nsd()->capLooped() || ( e->myLoc != _qp.nsd()->capExtent ) )
+ return e->firstRecord;
+ // Likely we are on the fresh side of capExtent, so return first fresh record.
+ // If we are on the stale side of capExtent, then the collection is small and it
+ // doesn't matter if we start the extent scan with capFirstNewRecord.
+ return _qp.nsd()->capFirstNewRecord;
+ }
+
+ void wassertExtentNonempty( const Extent *e ) {
+ // TODO ensure this requirement is clearly enforced, or fix.
+ wassert( !e->firstRecord.isNull() );
+ }
+
+ DiskLoc FindingStartCursor::prevExtentFirstLoc( const DiskLoc &rec ) {
+ Extent *e = rec.rec()->myExtent( rec );
+ if ( _qp.nsd()->capLooped() ) {
+ if ( e->xprev.isNull() ) {
+ e = _qp.nsd()->lastExtent.ext();
+ }
+ else {
+ e = e->xprev.ext();
+ }
+ if ( e->myLoc != _qp.nsd()->capExtent ) {
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ else {
+ if ( !e->xprev.isNull() ) {
+ e = e->xprev.ext();
+ wassertExtentNonempty( e );
+ return e->firstRecord;
+ }
+ }
+ return DiskLoc(); // reached beginning of collection
+ }
+
+ void FindingStartCursor::createClientCursor( const DiskLoc &startLoc ) {
+ shared_ptr<Cursor> c = _qp.newCursor( startLoc );
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) );
+ }
+
+ bool FindingStartCursor::firstDocMatchesOrEmpty() const {
+ shared_ptr<Cursor> c = _qp.newCursor();
+ return !c->ok() || _matcher->matchesCurrent( c.get() );
+ }
+
+ void FindingStartCursor::init() {
+ BSONElement tsElt = _qp.originalQuery()[ "ts" ];
+ massert( 13044, "no ts field in query", !tsElt.eoo() );
+ BSONObjBuilder b;
+ b.append( tsElt );
+ BSONObj tsQuery = b.obj();
+ _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey()));
+ if ( firstDocMatchesOrEmpty() ) {
+ _c = _qp.newCursor();
+ _findingStart = false;
+ return;
+ }
+ // Use a ClientCursor here so we can release db mutex while scanning
+ // oplog (can take quite a while with large oplogs).
+ shared_ptr<Cursor> c = _qp.newReverseCursor();
+ _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) );
+ _findingStartTimer.reset();
+ _findingStartMode = Initial;
+ }
+
+ // -------------------------------------
+
+ struct TestOpTime : public UnitTest {
+ void run() {
+ OpTime t;
+ for ( int i = 0; i < 10; i++ ) {
+ OpTime s = OpTime::now_inlock();
+ assert( s != t );
+ t = s;
+ }
+ OpTime q = t;
+ assert( q == t );
+ assert( !(q != t) );
+ }
+ } testoptime;
+
+ int _dummy_z;
+
+ void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) {
+ DEV assert( !d.dbMutex.isWriteLocked() );
+
+ Client *c = currentClient.get();
+ if( c == 0 ) {
+ Client::initThread("pretouchN");
+ c = &cc();
+ }
+
+ readlock lk("");
+ for( unsigned i = a; i <= b; i++ ) {
+ const BSONObj& op = v[i];
+ const char *which = "o";
+ const char *opType = op.getStringField("op");
+ if ( *opType == 'i' )
+ ;
+ else if( *opType == 'u' )
+ which = "o2";
+ else
+ continue;
+ /* todo : other operations */
+
+ try {
+ BSONObj o = op.getObjectField(which);
+ BSONElement _id;
+ if( o.getObjectID(_id) ) {
+ const char *ns = op.getStringField("ns");
+ BSONObjBuilder b;
+ b.append(_id);
+ BSONObj result;
+ Client::Context ctx( ns );
+ if( Helpers::findById(cc(), ns, b.done(), result) )
+ _dummy_z += result.objsize(); // touch
+ }
+ }
+ catch( DBException& e ) {
+ log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' << e.toString() << endl;
+ }
+ }
+ }
+
+ void pretouchOperation(const BSONObj& op) {
+
+ if( d.dbMutex.isWriteLocked() )
+ return; // no point pretouching if write locked. not sure if this will ever fire, but just in case.
+
+ const char *which = "o";
+ const char *opType = op.getStringField("op");
+ if ( *opType == 'i' )
+ ;
+ else if( *opType == 'u' )
+ which = "o2";
+ else
+ return;
+ /* todo : other operations */
+
+ try {
+ BSONObj o = op.getObjectField(which);
+ BSONElement _id;
+ if( o.getObjectID(_id) ) {
+ const char *ns = op.getStringField("ns");
+ BSONObjBuilder b;
+ b.append(_id);
+ BSONObj result;
+ readlock lk(ns);
+ Client::Context ctx( ns );
+ if( Helpers::findById(cc(), ns, b.done(), result) )
+ _dummy_z += result.objsize(); // touch
+ }
+ }
+ catch( DBException& ) {
+ log() << "ignoring assertion in pretouchOperation()" << endl;
+ }
+ }
+
+ BSONObj Sync::getMissingDoc(const BSONObj& o) {
+ OplogReader missingObjReader;
+
+ uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));
+
+ const char *ns = o.getStringField("ns");
+ // might be more than just _id in the update criteria
+ BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
+ BSONObj missingObj;
+ try {
+ missingObj = missingObjReader.findOne(ns, query);
+ } catch(DBException& e) {
+ log() << "replication assertion fetching missing object: " << e.what() << endl;
+ throw;
+ }
+
+ return missingObj;
+ }
+
+ bool Sync::shouldRetry(const BSONObj& o) {
+ // we don't have the object yet, which is possible on initial sync. get it.
+ log() << "replication info adding missing object" << endl; // rare enough we can log
+
+ BSONObj missingObj = getMissingDoc(o);
+
+ if( missingObj.isEmpty() ) {
+ log() << "replication missing object not found on source. presumably deleted later in oplog" << endl;
+ log() << "replication o2: " << o.getObjectField("o2").toString() << endl;
+ log() << "replication o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
+
+ return false;
+ }
+ else {
+ const char *ns = o.getStringField("ns");
+ Client::Context ctx(ns);
+ DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
+ uassert(15917, "Got bad disk location when attempting to insert", !d.isNull());
+
+ return true;
+ }
+ }
+
+ /** @param fromRepl false if from ApplyOpsCmd
+        @return true if it was an update that should have happened but the document does not exist. see replset initial sync code.
+ */
+ bool applyOperation_inlock(const BSONObj& op , bool fromRepl ) {
+ assertInWriteLock();
+ LOG(6) << "applying op: " << op << endl;
+ bool failedUpdate = false;
+
+ OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;
+
+ const char *names[] = { "o", "ns", "op", "b" };
+ BSONElement fields[4];
+ op.getFields(4, names, fields);
+
+ BSONObj o;
+ if( fields[0].isABSONObj() )
+ o = fields[0].embeddedObject();
+
+ const char *ns = fields[1].valuestrsafe();
+
+ // operation type -- see logOp() comments for types
+ const char *opType = fields[2].valuestrsafe();
+
+ if ( *opType == 'i' ) {
+ opCounters->gotInsert();
+
+ const char *p = strchr(ns, '.');
+ if ( p && strcmp(p, ".system.indexes") == 0 ) {
+ // updates aren't allowed for indexes -- so we will do a regular insert. if index already
+ // exists, that is ok.
+ theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
+ }
+ else {
+ // do upserts for inserts as we might get replayed more than once
+ OpDebug debug;
+ BSONElement _id;
+ if( !o.getObjectID(_id) ) {
+ /* No _id. This will be very slow. */
+ Timer t;
+ updateObjects(ns, o, o, true, false, false, debug );
+ if( t.millis() >= 2 ) {
+ RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
+ }
+ }
+ else {
+                /* erh 10/16/2009 - this is probably not relevant any more since it's auto-created, but not worth removing */
+ RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow
+
+ /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
+ then. very few upserts will not be inserts...
+ */
+ BSONObjBuilder b;
+ b.append(_id);
+ updateObjects(ns, o, b.done(), true, false, false , debug );
+ }
+ }
+ }
+ else if ( *opType == 'u' ) {
+ opCounters->gotUpdate();
+ // dm do we create this for a capped collection?
+ // - if not, updates would be slow
+ // - but if were by id would be slow on primary too so maybe ok
+ // - if on primary was by another key and there are other indexes, this could be very bad w/out an index
+ // - if do create, odd to have on secondary but not primary. also can cause secondary to block for
+ // quite a while on creation.
+ RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow
+ OpDebug debug;
+ BSONObj updateCriteria = op.getObjectField("o2");
+ bool upsert = fields[3].booleanSafe();
+ UpdateResult ur = updateObjects(ns, o, updateCriteria, upsert, /*multi*/ false, /*logop*/ false , debug );
+ if( ur.num == 0 ) {
+ if( ur.mod ) {
+ if( updateCriteria.nFields() == 1 ) {
+ // was a simple { _id : ... } update criteria
+ failedUpdate = true;
+ // todo: probably should assert in these failedUpdate cases if not in initialSync
+ }
+                    // need to check to see if it isn't present so we can set failedUpdate correctly.
+                    // note this adds some overhead for the extra check in some cases, such as an updateCriteria
+                    // of the form
+                    //   { _id:..., x : { $size:... } }
+                    // thus this is not ideal.
+ else {
+ NamespaceDetails *nsd = nsdetails(ns);
+
+ if (nsd == NULL ||
+ (nsd->findIdIndex() >= 0 && Helpers::findById(nsd, updateCriteria).isNull()) ||
+ // capped collections won't have an _id index
+ (nsd->findIdIndex() < 0 && Helpers::findOne(ns, updateCriteria, false).isNull())) {
+ failedUpdate = true;
+ }
+
+ // Otherwise, it's present; zero objects were updated because of additional specifiers
+ // in the query for idempotence
+ }
+ }
+ else {
+                    // this could happen benignly on an oplog duplicate replay of an upsert
+                    // (because we are idempotent);
+                    // if a regular non-mod update fails, the item is (presumably) missing.
+ if( !upsert ) {
+ failedUpdate = true;
+ }
+ }
+ }
+ }
+ else if ( *opType == 'd' ) {
+ opCounters->gotDelete();
+ if ( opType[1] == 0 )
+ deleteObjects(ns, o, /*justOne*/ fields[3].booleanSafe());
+ else
+ assert( opType[1] == 'b' ); // "db" advertisement
+ }
+ else if ( *opType == 'c' ) {
+ opCounters->gotCommand();
+ BufBuilder bb;
+ BSONObjBuilder ob;
+ _runCommands(ns, o, bb, ob, true, 0);
+ }
+ else if ( *opType == 'n' ) {
+ // no op
+ }
+ else {
+ throw MsgAssertionException( 14825 , ErrorMsg("error in applyOperation : unknown opType ", *opType) );
+ }
+ return failedUpdate;
+ }
+
+ class ApplyOpsCmd : public Command {
+ public:
+ virtual bool slaveOk() const { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ ApplyOpsCmd() : Command( "applyOps" ) {}
+ virtual void help( stringstream &help ) const {
+ help << "internal (sharding)\n{ applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... } ] }";
+ }
+ virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+ if ( cmdObj.firstElement().type() != Array ) {
+ errmsg = "ops has to be an array";
+ return false;
+ }
+
+ BSONObj ops = cmdObj.firstElement().Obj();
+
+ {
+ // check input
+ BSONObjIterator i( ops );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.type() == Object )
+ continue;
+ errmsg = "op not an object: ";
+ errmsg += e.fieldName();
+ return false;
+ }
+ }
+
+ if ( cmdObj["preCondition"].type() == Array ) {
+ BSONObjIterator i( cmdObj["preCondition"].Obj() );
+ while ( i.more() ) {
+ BSONObj f = i.next().Obj();
+
+ BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() );
+
+ Matcher m( f["res"].Obj() );
+ if ( ! m.matches( realres ) ) {
+ result.append( "got" , realres );
+ result.append( "whatFailed" , f );
+ errmsg = "pre-condition failed";
+ return false;
+ }
+ }
+ }
+
+ // apply
+ int num = 0;
+ BSONObjIterator i( ops );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ // todo SERVER-4259 ?
+ applyOperation_inlock( e.Obj() , false );
+ num++;
+ }
+
+ result.append( "applied" , num );
+
+ if ( ! fromRepl ) {
+ // We want this applied atomically on slaves
+ // so we re-wrap without the pre-condition for speed
+
+ string tempNS = str::stream() << dbname << ".$cmd";
+
+ logOp( "c" , tempNS.c_str() , cmdObj.firstElement().wrap() );
+ }
+
+ return true;
+ }
+
+ DBDirectClient db;
+
+ } applyOpsCmd;
+
+}
diff --git a/src/mongo/db/oplog.h b/src/mongo/db/oplog.h
new file mode 100644
index 00000000000..6c1644fe3ab
--- /dev/null
+++ b/src/mongo/db/oplog.h
@@ -0,0 +1,149 @@
+// oplog.h - writing to and reading from oplog
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+
+ local.oplog.$main is the default
+*/
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "clientcursor.h"
+#include "../client/dbclient.h"
+#include "../util/optime.h"
+#include "../util/timer.h"
+
+namespace mongo {
+
+ void createOplog();
+
+ void _logOpObjRS(const BSONObj& op);
+
+ /** Write operation to the log (local.oplog.$main)
+
+ @param opstr
+ "i" insert
+ "u" update
+ "d" delete
+ "c" db cmd
+ "n" no-op
+ "db" declares presence of a database (ns is set to the db name + '.')
+
+ See _logOp() in oplog.cpp for more details.
+ */
+ void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0);
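+    /* usage sketch (illustrative): after inserting obj into test.foo,
+           logOp( "i", "test.foo", obj );
+       and for a justOne delete, pass the flag through b:
+           bool justOne = true;
+           logOp( "d", "test.foo", criteria, 0, &justOne );
+    */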
+
+ void logKeepalive();
+
+ /** puts obj in the oplog as a comment (a no-op). Just for diags.
+ convention is
+ { msg : "text", ... }
+ */
+ void logOpComment(const BSONObj& obj);
+
+ void oplogCheckCloseDatabase( Database * db );
+
+ extern int __findingStartInitialTimeout; // configurable for testing
+
+ class QueryPlan;
+
+ /** Implements an optimized procedure for finding the first op in the oplog. */
+ class FindingStartCursor {
+ public:
+
+ /**
+ * The cursor will attempt to find the first op in the oplog matching the
+ * 'ts' field of the qp's query.
+ */
+ FindingStartCursor( const QueryPlan & qp );
+
+ /** @return true if the first matching op in the oplog has been found. */
+ bool done() const { return !_findingStart; }
+
+ /** @return cursor pointing to the first matching op, if done(). */
+ shared_ptr<Cursor> cursor() { verify( 14835, done() ); return _c; }
+
+ /** Iterate the cursor, to continue trying to find matching op. */
+ void next();
+
+ /** Yield cursor, if not done(). */
+ bool prepareToYield() {
+ if ( _findingStartCursor ) {
+ return _findingStartCursor->prepareToYield( _yieldData );
+ }
+ return false;
+ }
+
+ /** Recover from cursor yield. */
+ void recoverFromYield() {
+ if ( _findingStartCursor ) {
+ if ( !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _findingStartCursor.reset( 0 );
+ msgassertedNoTrace( 15889, "FindingStartCursor::recoverFromYield() failed to recover" );
+ }
+ }
+ }
+ private:
+ enum FindingStartMode { Initial, FindExtent, InExtent };
+ const QueryPlan &_qp;
+ bool _findingStart;
+ FindingStartMode _findingStartMode;
+ auto_ptr< CoveredIndexMatcher > _matcher;
+ Timer _findingStartTimer;
+ ClientCursor::CleanupPointer _findingStartCursor;
+ shared_ptr<Cursor> _c;
+ ClientCursor::YieldData _yieldData;
+ DiskLoc extentFirstLoc( const DiskLoc &rec );
+
+ DiskLoc prevExtentFirstLoc( const DiskLoc &rec );
+ void createClientCursor( const DiskLoc &startLoc = DiskLoc() );
+ void destroyClientCursor() {
+ _findingStartCursor.reset( 0 );
+ }
+ void init();
+ bool firstDocMatchesOrEmpty() const;
+ };
+
+ class Sync {
+ protected:
+ string hn;
+ public:
+ Sync(const string& hostname) : hn(hostname) {}
+ virtual ~Sync() {}
+ virtual BSONObj getMissingDoc(const BSONObj& o);
+
+ /**
+         * Whether applyOperation_inlock should be called again after an update fails.
+ */
+ virtual bool shouldRetry(const BSONObj& o);
+ };
+
+ void pretouchOperation(const BSONObj& op);
+ void pretouchN(vector<BSONObj>&, unsigned a, unsigned b);
+
+    /**
+     * take an op and apply it locally.
+     * used for applying ops from an oplog.
+     * @param fromRepl really from replication, as opposed to testing/internal/command/etc...
+     * @return whether the op was an update that could not be applied (true on failure)
+     */
+ bool applyOperation_inlock(const BSONObj& op , bool fromRepl = true );
+}
diff --git a/src/mongo/db/oplogreader.h b/src/mongo/db/oplogreader.h
new file mode 100644
index 00000000000..6efd1469c01
--- /dev/null
+++ b/src/mongo/db/oplogreader.h
@@ -0,0 +1,121 @@
+/** @file oplogreader.h */
+
+#pragma once
+
+#include "../client/dbclient.h"
+#include "../client/constants.h"
+#include "dbhelpers.h"
+
+namespace mongo {
+
+    /* started abstracting out the querying of the primary/master's oplog.
+       still fairly awkward, but a start.
+    */
+ class OplogReader {
+ shared_ptr<DBClientConnection> _conn;
+ shared_ptr<DBClientCursor> cursor;
+ public:
+ OplogReader() { }
+ ~OplogReader() { }
+ void resetCursor() { cursor.reset(); }
+ void resetConnection() {
+ cursor.reset();
+ _conn.reset();
+ }
+ DBClientConnection* conn() { return _conn.get(); }
+ BSONObj findOne(const char *ns, const Query& q) {
+ return conn()->findOne(ns, q, 0, QueryOption_SlaveOk);
+ }
+ BSONObj getLastOp(const char *ns) {
+ return findOne(ns, Query().sort(reverseNaturalObj));
+ }
+
+ /* ok to call if already connected */
+ bool connect(string hostname);
+
+ bool connect(const BSONObj& rid, const int from, const string& to);
+
+ void tailCheck() {
+ if( cursor.get() && cursor->isDead() ) {
+ log() << "repl: old cursor isDead, will initiate a new one" << endl;
+ resetCursor();
+ }
+ }
+
+ bool haveCursor() { return cursor.get() != 0; }
+
+        /** this is ok but commented out: when using it, one should consider whether
+            QueryOption_OplogReplay is needed; if not, fine, but if so, this needs to change.
+            *//*
+ void query(const char *ns, const BSONObj& query) {
+ assert( !haveCursor() );
+ cursor.reset( _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk).release() );
+ }*/
+
+        /** this can be used; it is commented out because it does not indicate
+            QueryOption_OplogReplay, and that is likely important. it could be uncommented --
+            we just need to add that option.
+        */
+ /*
+ void queryGTE(const char *ns, OpTime t) {
+ BSONObjBuilder q;
+ q.appendDate("$gte", t.asDate());
+ BSONObjBuilder q2;
+ q2.append("ts", q.done());
+ query(ns, q2.done());
+ }
+ */
+
+ void tailingQuery(const char *ns, const BSONObj& query, const BSONObj* fields=0) {
+ assert( !haveCursor() );
+ log(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl;
+ cursor.reset( _conn->query( ns, query, 0, 0, fields,
+ QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay |
+ /* TODO: slaveOk maybe shouldn't use? */
+ QueryOption_AwaitData
+ ).release() );
+ }
+
+ void tailingQueryGTE(const char *ns, OpTime t, const BSONObj* fields=0) {
+ BSONObjBuilder q;
+ q.appendDate("$gte", t.asDate());
+ BSONObjBuilder query;
+ query.append("ts", q.done());
+ tailingQuery(ns, query.done(), fields);
+ }
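+        /* the query sent above has the shape (illustrative):
+             { ts: { $gte: <t> } }
+           issued with the tailable, slaveOk, oplogReplay and awaitData options. */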
+
+ /* Do a tailing query, but only send the ts field back. */
+ void ghostQueryGTE(const char *ns, OpTime t) {
+ const BSONObj fields = BSON("ts" << 1 << "_id" << 0);
+ return tailingQueryGTE(ns, t, &fields);
+ }
+
+ bool more() {
+ uassert( 15910, "Doesn't have cursor for reading oplog", cursor.get() );
+ return cursor->more();
+ }
+
+ bool moreInCurrentBatch() {
+ uassert( 15911, "Doesn't have cursor for reading oplog", cursor.get() );
+ return cursor->moreInCurrentBatch();
+ }
+
+ /* old mongod's can't do the await flag... */
+ bool awaitCapable() {
+ return cursor->hasResultFlag(ResultFlag_AwaitCapable);
+ }
+
+ void peek(vector<BSONObj>& v, int n) {
+ if( cursor.get() )
+ cursor->peek(v,n);
+ }
+ BSONObj nextSafe() { return cursor->nextSafe(); }
+ BSONObj next() { return cursor->next(); }
+ void putBack(BSONObj op) { cursor->putBack(op); }
+
+ private:
+ bool commonConnect(const string& hostName);
+ bool passthroughHandshake(const BSONObj& rid, const int f);
+ };
+
+}
diff --git a/src/mongo/db/ops/count.cpp b/src/mongo/db/ops/count.cpp
new file mode 100644
index 00000000000..3c183596b9d
--- /dev/null
+++ b/src/mongo/db/ops/count.cpp
@@ -0,0 +1,103 @@
+// count.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "count.h"
+
+#include "../client.h"
+#include "../clientcursor.h"
+#include "../namespace.h"
+#include "../queryutil.h"
+
+namespace mongo {
+
+ long long runCount( const char *ns, const BSONObj &cmd, string &err ) {
+ Client::Context cx(ns);
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ err = "ns missing";
+ return -1;
+ }
+ BSONObj query = cmd.getObjectField("query");
+
+ // count of all objects
+ if ( query.isEmpty() ) {
+ return applySkipLimit( d->stats.nrecords , cmd );
+ }
+
+ string exceptionInfo;
+ long long count = 0;
+ long long skip = cmd["skip"].numberLong();
+ long long limit = cmd["limit"].numberLong();
+ bool simpleEqualityMatch;
+ shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), false, &simpleEqualityMatch );
+ ClientCursor::CleanupPointer ccPointer;
+ ElapsedTracker timeToStartYielding( 256, 20 );
+ try {
+ while( cursor->ok() ) {
+ if ( !ccPointer ) {
+ if ( timeToStartYielding.intervalHasElapsed() ) {
+ // Lazily construct a ClientCursor, avoiding a performance regression when scanning a very
+ // small number of documents.
+ ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
+ }
+ }
+ else if ( !ccPointer->yieldSometimes( simpleEqualityMatch ? ClientCursor::DontNeed : ClientCursor::MaybeCovered ) ||
+ !cursor->ok() ) {
+ break;
+ }
+
+ // With simple equality matching there is no need to use the matcher because the bounds
+ // are enforced by the FieldRangeVectorIterator and only key fields have constraints. There
+ // is no need to do key deduping because an exact value is specified in the query for all key
+ // fields and duplicate keys are not allowed per document.
+ // NOTE In the distant past we used a min/max bounded BtreeCursor with a shallow
+ // equality comparison to check for matches in the simple match case. That may be
+ // more performant, but I don't think we've measured the performance.
+ if ( simpleEqualityMatch ||
+ ( cursor->currentMatches() && !cursor->getsetdup( cursor->currLoc() ) ) ) {
+
+ if ( skip > 0 ) {
+ --skip;
+ }
+ else {
+ ++count;
+ if ( limit > 0 && count >= limit ) {
+ break;
+ }
+ }
+ }
+ cursor->advance();
+ }
+ ccPointer.reset();
+ return count;
+
+ } catch ( const DBException &e ) {
+ exceptionInfo = e.toString();
+ } catch ( const std::exception &e ) {
+ exceptionInfo = e.what();
+ } catch ( ... ) {
+ exceptionInfo = "unknown exception";
+ }
+ // Historically we have returned zero in many count assertion cases - see SERVER-2291.
+ log() << "Count with ns: " << ns << " and query: " << query
+ << " failed with exception: " << exceptionInfo
+ << endl;
+ return 0;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/count.h b/src/mongo/db/ops/count.h
new file mode 100644
index 00000000000..807741e1253
--- /dev/null
+++ b/src/mongo/db/ops/count.h
@@ -0,0 +1,30 @@
+// count.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../jsobj.h"
+#include "../diskloc.h"
+
+namespace mongo {
+
+ /**
+ * { count: "collectionname"[, query: <query>] }
+     * @return -1 if the ns does not exist; 0 if the count fails with an exception (see SERVER-2291); otherwise the match count.
+ */
+ long long runCount(const char *ns, const BSONObj& cmd, string& err);
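+
+    /* For illustration (collection name and values hypothetical), a typical cmd object
+       looks like:
+           { count : "foo" , query : { x : 1 } , skip : 5 , limit : 100 }
+       evaluated against an ns such as "test.foo". */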
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/delete.cpp b/src/mongo/db/ops/delete.cpp
new file mode 100644
index 00000000000..e33611c151e
--- /dev/null
+++ b/src/mongo/db/ops/delete.cpp
@@ -0,0 +1,158 @@
+// delete.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "delete.h"
+#include "../queryoptimizer.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+ /* ns: namespace, e.g. <database>.<collection>
+ pattern: the "where" clause / criteria
+ justOne: stop after 1 match
+ god: allow access to system namespaces, and don't yield
+ */
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
+ if( !god ) {
+ if ( strstr(ns, ".system.") ) {
+ /* note a delete from system.indexes would corrupt the db
+ if done here, as there are pointers into those objects in
+ NamespaceDetails.
+ */
+ uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
+ }
+ if ( strchr( ns , '$' ) ) {
+ log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
+ uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
+ }
+ }
+
+ {
+ NamespaceDetails *d = nsdetails( ns );
+ if ( ! d )
+ return 0;
+ uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
+ }
+
+ long long nDeleted = 0;
+
+ shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern, BSONObj(), false, 0 );
+
+ if( !creal->ok() )
+ return nDeleted;
+
+ shared_ptr< Cursor > cPtr = creal;
+ auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
+ cc->setDoingDeletes( true );
+
+ CursorId id = cc->cursorid();
+
+ bool justOne = justOneOrig;
+ bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());
+
+ do {
+ // TODO: we can generalize this I believe
+ //
+ bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern );
+ if ( ! willNeedRecord ) {
+ // TODO: this is a total hack right now
+ // check if the index full encompasses query
+
+ if ( pattern.nFields() == 1 &&
+ str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
+ willNeedRecord = true;
+ }
+
+ if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
+ cc.release(); // has already been deleted elsewhere
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !cc->ok() ) {
+ break; // if we yielded, could have hit the end
+ }
+
+ // this way we can avoid calling updateLocation() every time (expensive)
+ // as well as some other nuances handled
+ cc->setDoingDeletes( true );
+
+ DiskLoc rloc = cc->currLoc();
+ BSONObj key = cc->currKey();
+
+ bool match = creal->currentMatches();
+ bool dup = cc->c()->getsetdup(rloc);
+
+ if ( ! cc->advance() )
+ justOne = true;
+
+ if ( ! match )
+ continue;
+
+ assert( !dup ); // can't be a dup, we deleted it!
+
+ if ( !justOne ) {
+                /* NOTE: this is SLOW. prepareToTouchEarlierIterate() was designed to be
+                   called across getMore blocks; here we might call it millions of times,
+                   which would be bad.
+                   */
+ cc->c()->prepareToTouchEarlierIterate();
+ }
+
+ if ( logop ) {
+ BSONElement e;
+ if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
+ BSONObjBuilder b;
+ b.append( e );
+ bool replJustOne = true;
+ logOp( "d", ns, b.done(), 0, &replJustOne );
+ }
+ else {
+ problem() << "deleted object without id, not logging" << endl;
+ }
+ }
+
+ if ( rs )
+ rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
+
+ theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
+ nDeleted++;
+ if ( justOne ) {
+ break;
+ }
+ cc->c()->recoverFromTouchingEarlierIterate();
+
+ if( !god )
+ getDur().commitIfNeeded();
+
+ if( debug && god && nDeleted == 100 )
+ log() << "warning high number of deletes with god=true which could use significant memory" << endl;
+ }
+ while ( cc->ok() );
+
+ if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
+ // TODO: remove this and the id declaration above if this doesn't trigger
+            // if it does, then I'm very confused (ERH 06/2011)
+ error() << "this should be impossible" << endl;
+ printStackTrace();
+ cc.release();
+ }
+
+ return nDeleted;
+ }
+
+}
diff --git a/src/mongo/db/ops/delete.h b/src/mongo/db/ops/delete.h
new file mode 100644
index 00000000000..a74b7a664bc
--- /dev/null
+++ b/src/mongo/db/ops/delete.h
@@ -0,0 +1,33 @@
+// delete.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../clientcursor.h"
+
+namespace mongo {
+
+ class RemoveSaver;
+
+    // Deletes documents from ns matching pattern; if justOne is true, at most one document is deleted.
+ long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false, RemoveSaver * rs=0);
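+
+    // Illustrative call (namespace and criteria hypothetical): remove at most one
+    // matching document from test.foo and log the delete to the oplog:
+    //     deleteObjects( "test.foo", BSON( "x" << 1 ), /*justOne=*/true, /*logop=*/true );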
+
+
+}
diff --git a/src/mongo/db/ops/query.cpp b/src/mongo/db/ops/query.cpp
new file mode 100644
index 00000000000..15e3ed9053f
--- /dev/null
+++ b/src/mongo/db/ops/query.cpp
@@ -0,0 +1,870 @@
+// query.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "query.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../../bson/util/builder.h"
+#include <time.h>
+#include "../introspect.h"
+#include "../btree.h"
+#include "../../util/lruishmap.h"
+#include "../json.h"
+#include "../repl.h"
+#include "../replutil.h"
+#include "../scanandorder.h"
+#include "../security.h"
+#include "../curop-inl.h"
+#include "../commands.h"
+#include "../queryoptimizer.h"
+#include "../lasterror.h"
+#include "../../s/d_logic.h"
+#include "../repl_block.h"
+#include "../../server.h"
+#include "../d_concurrency.h"
+
+namespace mongo {
+
+ /* We cut off further objects once we cross this threshold; thus, you might get
+ a little bit more than this, it is a threshold rather than a limit.
+ */
+ const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024;
+
+ //ns->query->DiskLoc
+// LRUishMap<BSONObj,DiskLoc,5> lrutest(123);
+
+ extern bool useCursors;
+ extern bool useHints;
+
+ bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) {
+ try {
+ return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions);
+ }
+ catch( SendStaleConfigException& ){
+ throw;
+ }
+ catch ( AssertionException& e ) {
+ assert( e.getCode() != SendStaleConfigCode && e.getCode() != RecvStaleConfigCode );
+
+ e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" );
+ curop.debug().exceptionInfo = e.getInfo();
+ }
+ anObjBuilder.append("errmsg", "db assertion failure");
+ anObjBuilder.append("ok", 0.0);
+ BSONObj x = anObjBuilder.done();
+ b.appendBuf((void*) x.objdata(), x.objsize());
+ return true;
+ }
+
+
+ BSONObj id_obj = fromjson("{\"_id\":1}");
+ BSONObj empty_obj = fromjson("{}");
+
+
+ //int dump = 0;
+
+ /* empty result for error conditions */
+ QueryResult* emptyMoreResult(long long cursorid) {
+ BufBuilder b(32768);
+ b.skip(sizeof(QueryResult));
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->cursorId = 0; // 0 indicates no more data to retrieve.
+ qr->startingFrom = 0;
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->initializeResultFlags();
+ qr->nReturned = 0;
+ b.decouple();
+ return qr;
+ }
+
+ QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
+ exhaust = false;
+ ClientCursor::Pointer p(cursorid);
+ ClientCursor *cc = p.c();
+
+ int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
+
+ BufBuilder b( bufSize );
+ b.skip(sizeof(QueryResult));
+ int resultFlags = ResultFlag_AwaitCapable;
+ int start = 0;
+ int n = 0;
+
+ if ( unlikely(!cc) ) {
+ LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
+ cursorid = 0;
+ resultFlags = ResultFlag_CursorNotFound;
+ }
+ else {
+        // check that the ns has not been spoofed and still matches the one the cursor was created with
+ uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));
+
+ if ( pass == 0 )
+ cc->updateSlaveLocation( curop );
+
+ int queryOptions = cc->queryOptions();
+
+ curop.debug().query = cc->query();
+
+ start = cc->pos();
+ Cursor *c = cc->c();
+ c->checkLocation();
+ DiskLoc last;
+
+ scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
+ if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
+ keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );
+
+ // This manager may be stale, but it's the state of chunking when the cursor was created.
+ ShardChunkManagerPtr manager = cc->getChunkManager();
+
+ while ( 1 ) {
+ if ( !c->ok() ) {
+ if ( c->tailable() ) {
+                    /* when a tailable cursor hits "EOF", ok() goes false and current() is null. however,
+                       advance() can still be retried as a reactivation attempt; when there is new data, it
+                       will return true. that's what we are doing here.
+                       */
+ if ( c->advance() )
+ continue;
+
+ if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
+ return 0;
+ }
+
+ break;
+ }
+ p.release();
+ bool ok = ClientCursor::erase(cursorid);
+ assert(ok);
+ cursorid = 0;
+ cc = 0;
+ break;
+ }
+
+ // in some cases (clone collection) there won't be a matcher
+ if ( c->matcher() && !c->matcher()->matchesCurrent( c ) ) {
+ }
+ else if ( manager && ! manager->belongsToMe( cc ) ){
+ LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
+ }
+ else {
+ if( c->getsetdup(c->currLoc()) ) {
+ //out() << " but it's a dup \n";
+ }
+ else {
+ last = c->currLoc();
+ n++;
+
+ if ( keyFieldsOnly ) {
+ fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
+ }
+ else {
+ BSONObj js = c->current();
+ // show disk loc should be part of the main query, not in an $or clause, so this should be ok
+ fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
+ }
+
+ if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
+ c->advance();
+ cc->incPos( n );
+ break;
+ }
+ }
+ }
+ c->advance();
+
+ if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) {
+ ClientCursor::erase(cursorid);
+ cursorid = 0;
+ cc = 0;
+ p.deleted();
+ break;
+ }
+ }
+
+ if ( cc ) {
+ cc->updateLocation();
+ cc->mayUpgradeStorage();
+ cc->storeOpForSlave( last );
+ exhaust = cc->queryOptions() & QueryOption_Exhaust;
+ }
+ }
+
+ QueryResult *qr = (QueryResult *) b.buf();
+ qr->len = b.len();
+ qr->setOperation(opReply);
+ qr->_resultFlags() = resultFlags;
+ qr->cursorId = cursorid;
+ qr->startingFrom = start;
+ qr->nReturned = n;
+ b.decouple();
+
+ return qr;
+ }
+
+ class ExplainBuilder {
+ // Note: by default we filter out allPlans and oldPlan in the shell's
+ // explain() function. If you add any recursive structures, make sure to
+ // edit the JS to make sure everything gets filtered.
+ public:
+ ExplainBuilder() : _i() {}
+ void ensureStartScan() {
+ if ( !_a.get() ) {
+ _a.reset( new BSONArrayBuilder() );
+ }
+ }
+ void noteCursor( Cursor *c ) {
+ BSONObjBuilder b( _a->subobjStart() );
+ b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds();
+ b.done();
+ }
+ void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder,
+ int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) {
+ if ( _i == 1 ) {
+ _c.reset( new BSONArrayBuilder() );
+ *_c << _b->obj();
+ }
+ if ( _i == 0 ) {
+ _b.reset( new BSONObjBuilder() );
+ }
+ else {
+ _b.reset( new BSONObjBuilder( _c->subobjStart() ) );
+ }
+ *_b << "cursor" << c->toString();
+ _b->appendNumber( "nscanned", nscanned );
+ _b->appendNumber( "nscannedObjects", nscannedObjects );
+ *_b << "n" << n;
+
+ if ( scanAndOrder )
+ *_b << "scanAndOrder" << true;
+
+ *_b << "millis" << millis;
+
+ *_b << "nYields" << nYields;
+ *_b << "nChunkSkips" << nChunkSkips;
+ *_b << "isMultiKey" << c->isMultiKey();
+ *_b << "indexOnly" << indexOnly;
+
+ *_b << "indexBounds" << c->prettyIndexBounds();
+
+ c->explainDetails( *_b );
+
+ if ( !hint ) {
+ *_b << "allPlans" << _a->arr();
+ }
+ if ( _i != 0 ) {
+ _b->done();
+ }
+ _a.reset( 0 );
+ ++_i;
+ }
+ BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) {
+ if ( _i > 1 ) {
+ BSONObjBuilder b;
+ b << "clauses" << _c->arr();
+ b.appendNumber( "nscanned", nscanned );
+ b.appendNumber( "nscannedObjects", nscannedObjects );
+ b << "n" << n;
+ b << "millis" << millis;
+ b.appendElements( suffix );
+ return b.obj();
+ }
+ else {
+ stringstream host;
+ host << getHostNameCached() << ":" << cmdLine.port;
+ *_b << "server" << host.str();
+ _b->appendElements( suffix );
+ return _b->obj();
+ }
+ }
+ private:
+ auto_ptr< BSONArrayBuilder > _a;
+ auto_ptr< BSONObjBuilder > _b;
+ auto_ptr< BSONArrayBuilder > _c;
+ int _i;
+ };
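+
+    /* For illustration only (field values hypothetical), a single-plan explain assembled
+       by this builder has roughly the shape:
+           { cursor: "BtreeCursor x_1", nscanned: 50, nscannedObjects: 50, n: 10,
+             millis: 3, nYields: 0, nChunkSkips: 0, isMultiKey: false, indexOnly: false,
+             indexBounds: {...}, allPlans: [...], server: "host:27017" }
+       while a multi-clause ($or) explain wraps the per-clause objects in a "clauses"
+       array with aggregate nscanned/nscannedObjects/n/millis appended. */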
+
+ // Implements database 'query' requests using the query optimizer's QueryOp interface
+ class UserQueryOp : public QueryOp {
+ public:
+
+ UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) :
+ _buf( 32768 ) , // TODO be smarter here
+ _pq( pq ) ,
+ _ntoskip( pq.getSkip() ) ,
+ _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0),
+ _n(0),
+ _oldN(0),
+ _nYields(),
+ _nChunkSkips(),
+ _chunkManager( shardingState.needShardChunkManager(pq.ns()) ?
+ shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ),
+ _inMemSort(false),
+ _capped(false),
+ _saveClientCursor(false),
+ _wouldSaveClientCursor(false),
+ _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ),
+ _response( response ),
+ _eb( eb ),
+ _curop( curop ),
+ _yieldRecoveryFailed()
+ {}
+
+ virtual void _init() {
+ // only need to put the QueryResult fields there if we're building the first buffer in the message.
+ if ( _response.empty() ) {
+ _buf.skip( sizeof( QueryResult ) );
+ }
+
+ if ( _oplogReplay ) {
+ _findingStartCursor.reset( new FindingStartCursor( qp() ) );
+ _capped = true;
+ }
+ else {
+ _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() );
+ _capped = _c->capped();
+
+ // setup check for if we can only use index to extract
+ if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) {
+ _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) );
+ }
+ }
+
+ if ( qp().scanAndOrderRequired() ) {
+ _inMemSort = true;
+ _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder(), qp().multikeyFrs() ) );
+ }
+
+ if ( _pq.isExplain() ) {
+ _eb.noteCursor( _c.get() );
+ }
+
+ }
+
+ virtual bool prepareToYield() {
+ if ( _findingStartCursor.get() ) {
+ return _findingStartCursor->prepareToYield();
+ }
+ else {
+ if ( _c && !_cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) );
+ }
+ if ( _cc ) {
+ return _cc->prepareToYield( _yieldData );
+ }
+ }
+ // no active cursor - ok to yield
+ return true;
+ }
+
+ virtual void recoverFromYield() {
+ _nYields++;
+
+ if ( _findingStartCursor.get() ) {
+ _findingStartCursor->recoverFromYield();
+ }
+ else if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _yieldRecoveryFailed = true;
+ _c.reset();
+ _cc.reset();
+ _so.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() );
+ }
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15890, str::stream() << "UserQueryOp::recoverFromYield() failed to recover: " << _pq.ns() );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+
+ // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran
+ }
+
+ }
+ }
+
+ virtual long long nscanned() {
+ if ( _findingStartCursor.get() ) {
+ return 0; // should only be one query plan, so value doesn't really matter.
+ }
+ return _c.get() ? _c->nscanned() : _nscanned;
+ }
+
+ virtual void next() {
+ if ( _findingStartCursor.get() ) {
+ if ( !_findingStartCursor->done() ) {
+ _findingStartCursor->next();
+ }
+ if ( _findingStartCursor->done() ) {
+ _c = _findingStartCursor->cursor();
+ _findingStartCursor.reset( 0 );
+ }
+ _capped = true;
+ return;
+ }
+
+ if ( !_c || !_c->ok() ) {
+ finish( false );
+ return;
+ }
+
+ bool mayCreateCursor1 = _pq.wantMore() && ! _inMemSort && _pq.getNumToReturn() != 1 && useCursors;
+
+ if( 0 ) {
+ cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl;
+ }
+
+ if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) {
+ finish( true ); //?
+ return;
+ }
+
+ _nscanned = _c->nscanned();
+ if ( !matcher( _c )->matchesCurrent(_c.get() , &_details ) ) {
+ // not a match, continue onward
+ if ( _details._loadedObject )
+ _nscannedObjects++;
+ }
+ else {
+ _nscannedObjects++;
+ DiskLoc cl = _c->currLoc();
+ if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) { // TODO: should make this covered at some point
+ _nChunkSkips++;
+ // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl;
+ }
+ else if( _c->getsetdup(cl) ) {
+ // dup
+ }
+ else {
+ // got a match.
+
+ if ( _inMemSort ) {
+ // note: no cursors for non-indexed, ordered results. results must be fairly small.
+ _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 );
+ }
+ else if ( _ntoskip > 0 ) {
+ _ntoskip--;
+ }
+ else {
+ if ( _pq.isExplain() ) {
+ _n++;
+ if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) {
+ // .limit() was used, show just that much.
+ finish( true ); //?
+ return;
+ }
+ }
+ else {
+
+ if ( _pq.returnKey() ) {
+ BSONObjBuilder bb( _buf );
+ bb.appendKeys( _c->indexKeyPattern() , _c->currKey() );
+ bb.done();
+ }
+ else if ( _keyFieldsOnly ) {
+ fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) );
+ }
+ else {
+ BSONObj js = _c->current();
+ assert( js.isValid() );
+
+ if ( _oplogReplay ) {
+ BSONElement e = js["ts"];
+ if ( e.type() == Date || e.type() == Timestamp )
+ _slaveReadTill = e._opTime();
+ }
+
+ fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0));
+ }
+ _n++;
+ if ( ! _c->supportGetMore() ) {
+ if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) {
+ finish( true );
+ return;
+ }
+ }
+ else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) {
+ /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */
+ if ( mayCreateCursor1 ) {
+ _wouldSaveClientCursor = true;
+ if ( _c->advance() ) {
+ // more...so save a cursor
+ _saveClientCursor = true;
+ }
+ }
+ finish( true );
+ return;
+ }
+ }
+ }
+ }
+ }
+ _c->advance();
+ }
+
+ // this plan won, so set data for response broadly
+ void finish( bool stop ) {
+ massert( 13638, "client cursor dropped during explain query yield", !_pq.isExplain() || _c.get() );
+
+ if ( _pq.isExplain() ) {
+ _n = _inMemSort ? _so->size() : _n;
+ }
+ else if ( _inMemSort ) {
+ if( _so.get() )
+ _so->fill( _buf, _pq.getFields() , _n );
+ }
+
+ if ( _c.get() ) {
+ _nscanned = _c->nscanned();
+
+ if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 )
+ _c->setTailable();
+
+ // If the tailing request succeeded.
+ if ( _c->tailable() )
+ _saveClientCursor = true;
+ }
+
+ if ( _pq.isExplain() ) {
+ _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(),
+ _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields ,
+                            _nChunkSkips, _keyFieldsOnly.get() != 0 );
+ }
+ else {
+ if ( _buf.len() ) {
+ _response.appendData( _buf.buf(), _buf.len() );
+ _buf.decouple();
+ }
+ }
+
+ if ( stop ) {
+ setStop();
+ }
+ else {
+ setComplete();
+ }
+
+ }
+
+ void finishExplain( const BSONObj &suffix ) {
+ BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix);
+ fillQueryResultFromObj(_buf, 0, obj);
+ _n = 1;
+ _oldN = 0;
+ _response.appendData( _buf.buf(), _buf.len() );
+ _buf.decouple();
+ }
+
+ virtual bool mayRecordPlan() const {
+ return !_yieldRecoveryFailed && ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) );
+ }
+
+ virtual QueryOp *_createChild() const {
+ if ( _pq.isExplain() ) {
+ _eb.ensureStartScan();
+ }
+ UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop );
+ ret->_oldN = n();
+ ret->_oldNscanned = totalNscanned();
+ ret->_oldNscannedObjects = nscannedObjects();
+ ret->_ntoskip = _ntoskip;
+ return ret;
+ }
+
+ bool scanAndOrderRequired() const { return _inMemSort; }
+ shared_ptr<Cursor> cursor() { return _c; }
+ int n() const { return _oldN + _n; }
+ long long totalNscanned() const { return _nscanned + _oldNscanned; }
+ long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; }
+ bool saveClientCursor() const { return _saveClientCursor; }
+ bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; }
+
+ void finishForOplogReplay( ClientCursor * cc ) {
+ if ( _oplogReplay && ! _slaveReadTill.isNull() )
+ cc->slaveReadTill( _slaveReadTill );
+
+ }
+
+ ShardChunkManagerPtr getChunkManager(){ return _chunkManager; }
+
+ private:
+ BufBuilder _buf;
+ const ParsedQuery& _pq;
+ scoped_ptr<Projection::KeyOnly> _keyFieldsOnly;
+
+ long long _ntoskip;
+ long long _nscanned;
+ long long _oldNscanned;
+ long long _nscannedObjects;
+ long long _oldNscannedObjects;
+ int _n; // found so far
+ int _oldN;
+
+ int _nYields;
+ int _nChunkSkips;
+
+ MatchDetails _details;
+
+ ShardChunkManagerPtr _chunkManager;
+
+ bool _inMemSort;
+ auto_ptr< ScanAndOrder > _so;
+
+ shared_ptr<Cursor> _c;
+ ClientCursor::CleanupPointer _cc;
+ ClientCursor::YieldData _yieldData;
+
+ bool _capped;
+ bool _saveClientCursor;
+ bool _wouldSaveClientCursor;
+ bool _oplogReplay;
+ auto_ptr< FindingStartCursor > _findingStartCursor;
+
+ Message &_response;
+ ExplainBuilder &_eb;
+ CurOp &_curop;
+ OpTime _slaveReadTill;
+
+ bool _yieldRecoveryFailed;
+ };
+
+    /* run a query -- includes checking for and running a Command
+       @return a pointer to the ns if in exhaust mode; 0 in normal mode
+    */
+ const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
+ shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
+ ParsedQuery& pq( *pq_shared );
+ int ntoskip = q.ntoskip;
+ BSONObj jsobj = q.query;
+ int queryOptions = q.queryOptions;
+ const char *ns = q.ns;
+
+ if( logLevel >= 2 )
+ log() << "runQuery called " << ns << " " << jsobj << endl;
+
+ curop.debug().ns = ns;
+ curop.debug().ntoreturn = pq.getNumToReturn();
+ curop.setQuery(jsobj);
+
+ if ( pq.couldBeCommand() ) {
+ BufBuilder bb;
+ bb.skip(sizeof(QueryResult));
+ BSONObjBuilder cmdResBuf;
+ if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
+ curop.debug().iscommand = true;
+ curop.debug().query = jsobj;
+ curop.markCommand();
+
+ auto_ptr< QueryResult > qr;
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+ curop.debug().responseLength = bb.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = 1;
+ result.setData( qr.release(), true );
+ }
+ else {
+ uasserted(13530, "bad or malformed command request?");
+ }
+ return 0;
+ }
+
+ /* --- regular query --- */
+
+ int n = 0;
+ BSONElement hint = useHints ? pq.getHint() : BSONElement();
+ bool explain = pq.isExplain();
+ bool snapshot = pq.isSnapshot();
+ BSONObj order = pq.getOrder();
+ BSONObj query = pq.getFilter();
+
+        /* The ElemIter will not be happy if this isn't really an object, so throw an
+           exception here when that is the case.
+           (This may indicate bad data from the client.)
+        */
+ if ( query.objsize() == 0 ) {
+ out() << "Bad query object?\n jsobj:";
+ out() << jsobj.toString() << "\n query:";
+ out() << query.toString() << endl;
+ uassert( 10110 , "bad query object", false);
+ }
+
+ Client::ReadContext ctx( ns , dbpath ); // read locks
+
+ replVerifyReadsOk(pq);
+
+ if ( pq.hasOption( QueryOption_CursorTailable ) ) {
+ NamespaceDetails *d = nsdetails( ns );
+ uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped );
+ const BSONObj nat1 = BSON( "$natural" << 1 );
+ if ( order.isEmpty() ) {
+ order = nat1;
+ }
+ else {
+ uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
+ }
+ }
+
+ BSONObj snapshotHint; // put here to keep the data in scope
+ if( snapshot ) {
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d ) {
+ int i = d->findIdIndex();
+ if( i < 0 ) {
+ if ( strstr( ns , ".system." ) == 0 )
+ log() << "warning: no _id index on $snapshot query, ns:" << ns << endl;
+ }
+ else {
+ /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here.
+ probably need a better way to specify "use the _id index" as a hint. if someone is
+ in the query optimizer please fix this then!
+ */
+ BSONObjBuilder b;
+ b.append("$hint", d->idx(i).indexName());
+ snapshotHint = b.obj();
+ hint = snapshotHint.firstElement();
+ }
+ }
+ }
+
+ if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {
+
+ bool nsFound = false;
+ bool indexFound = false;
+
+ BSONObj resObject;
+ Client& c = cc();
+ bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
+ if ( nsFound == false || indexFound == true ) {
+ BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
+ bb.skip(sizeof(QueryResult));
+
+ curop.debug().idhack = true;
+ if ( found ) {
+ n = 1;
+ fillQueryResultFromObj( bb , pq.getFields() , resObject );
+ }
+ auto_ptr< QueryResult > qr;
+ qr.reset( (QueryResult *) bb.buf() );
+ bb.decouple();
+ qr->setResultFlagsToOk();
+ qr->len = bb.len();
+
+ curop.debug().responseLength = bb.len();
+ qr->setOperation(opReply);
+ qr->cursorId = 0;
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+ result.setData( qr.release(), true );
+ return NULL;
+ }
+ }
+
+ // regular, not QO bypass query
+
+ BSONObj oldPlan;
+ if ( explain && ! pq.hasIndexSpecifier() ) {
+ MultiPlanScanner mps( ns, query, order );
+ if ( mps.usingCachedPlan() )
+ oldPlan = mps.oldExplain();
+ }
+ auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) );
+ BSONObj explainSuffix;
+ if ( explain ) {
+ BSONObjBuilder bb;
+ if ( !oldPlan.isEmpty() )
+ bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() );
+ explainSuffix = bb.obj();
+ }
+ ExplainBuilder eb;
+ UserQueryOp original( pq, result, eb, curop );
+ shared_ptr< UserQueryOp > o = mps->runOp( original );
+ UserQueryOp &dqo = *o;
+ if ( ! dqo.complete() )
+ throw MsgAssertionException( dqo.exception() );
+ if ( explain ) {
+ dqo.finishExplain( explainSuffix );
+ }
+ n = dqo.n();
+ long long nscanned = dqo.totalNscanned();
+ curop.debug().scanAndOrder = dqo.scanAndOrderRequired();
+
+ shared_ptr<Cursor> cursor = dqo.cursor();
+ if( logLevel >= 5 )
+ log() << " used cursor: " << cursor.get() << endl;
+ long long cursorid = 0;
+ const char * exhaust = 0;
+ if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) {
+ ClientCursor *cc;
+ bool moreClauses = mps->mayRunMore();
+ if ( moreClauses ) {
+ // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield
+ shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher( cursor ), dqo ) );
+ cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned());
+ }
+ else {
+ if( ! cursor->matcher() ) cursor->setMatcher( dqo.matcher( cursor ) );
+ cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() );
+ }
+
+ cc->setChunkManager( dqo.getChunkManager() );
+
+ cursorid = cc->cursorid();
+ DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
+ cc->setPos( n );
+ cc->pq = pq_shared;
+ cc->fields = pq.getFieldPtr();
+ cc->originalMessage = m;
+ cc->updateLocation();
+ if ( !cc->ok() && cc->c()->tailable() )
+ DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
+ if( queryOptions & QueryOption_Exhaust ) {
+ exhaust = ns;
+ curop.debug().exhaust = true;
+ }
+ dqo.finishForOplogReplay(cc);
+ }
+
+ QueryResult *qr = (QueryResult *) result.header();
+ qr->cursorId = cursorid;
+ qr->setResultFlagsToOk();
+ // qr->len is updated automatically by appendData()
+ curop.debug().responseLength = qr->len;
+ qr->setOperation(opReply);
+ qr->startingFrom = 0;
+ qr->nReturned = n;
+
+ int duration = curop.elapsedMillis();
+ bool dbprofile = curop.shouldDBProfile( duration );
+ if ( dbprofile || duration >= cmdLine.slowMS ) {
+ curop.debug().nscanned = (int) nscanned;
+ curop.debug().ntoskip = ntoskip;
+ }
+ curop.debug().nreturned = n;
+ return exhaust;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/query.h b/src/mongo/db/ops/query.h
new file mode 100644
index 00000000000..3324b75fe16
--- /dev/null
+++ b/src/mongo/db/ops/query.h
@@ -0,0 +1,248 @@
+// query.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../pch.h"
+#include "../../util/net/message.h"
+#include "../dbmessage.h"
+#include "../jsobj.h"
+#include "../diskloc.h"
+#include "../projection.h"
+
+// struct QueryOptions, QueryResult, QueryResultFlags in:
+#include "../../client/dbclient.h"
+
+namespace mongo {
+
+ extern const int MaxBytesToReturnToClientAtOnce;
+
+ QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust);
+
+ const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result);
+
+ /* This is for languages whose "objects" are not well ordered (JSON is well ordered).
+ [ { a : ... } , { b : ... } ] -> { a : ..., b : ... }
+ */
+ inline BSONObj transformOrderFromArrayFormat(BSONObj order) {
+ /* note: this is slow, but that is ok as order will have very few pieces */
+ BSONObjBuilder b;
+ char p[2] = "0";
+
+ while ( 1 ) {
+ BSONObj j = order.getObjectField(p);
+ if ( j.isEmpty() )
+ break;
+ BSONElement e = j.firstElement();
+ uassert( 10102 , "bad order array", !e.eoo());
+ uassert( 10103 , "bad order array [2]", e.isNumber());
+ b.append(e);
+ (*p)++;
+ uassert( 10104 , "too many ordering elements", *p <= '9');
+ }
+
+ return b.obj();
+ }
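+
+    /* Example (values illustrative): a driver sending the order as the array
+       [ { a : 1 } , { b : -1 } ] delivers it as { "0" : { a : 1 } , "1" : { b : -1 } },
+       which this function flattens to { a : 1 , b : -1 }. */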
+
+ /**
+ * this represents a total user query
+ * includes fields from the query message, both possible query levels
+ * parses everything up front
+ */
+ class ParsedQuery : boost::noncopyable {
+ public:
+ ParsedQuery( QueryMessage& qm )
+ : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) {
+ init( qm.query );
+ initFields( qm.fields );
+ }
+ ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields )
+ : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ) {
+ init( query );
+ initFields( fields );
+ }
+
+ const char * ns() const { return _ns; }
+ bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; }
+
+ const BSONObj& getFilter() const { return _filter; }
+ Projection* getFields() const { return _fields.get(); }
+ shared_ptr<Projection> getFieldPtr() const { return _fields; }
+
+ int getSkip() const { return _ntoskip; }
+ int getNumToReturn() const { return _ntoreturn; }
+ bool wantMore() const { return _wantMore; }
+ int getOptions() const { return _options; }
+ bool hasOption( int x ) const { return x & _options; }
+
+ bool isExplain() const { return _explain; }
+ bool isSnapshot() const { return _snapshot; }
+ bool returnKey() const { return _returnKey; }
+ bool showDiskLoc() const { return _showDiskLoc; }
+
+ const BSONObj& getMin() const { return _min; }
+ const BSONObj& getMax() const { return _max; }
+ const BSONObj& getOrder() const { return _order; }
+ const BSONElement& getHint() const { return _hint; }
+ int getMaxScan() const { return _maxScan; }
+
+ bool couldBeCommand() const {
+ /* we assume you are using findOne() for running a cmd... */
+ return _ntoreturn == 1 && strstr( _ns , ".$cmd" );
+ }
+
+ bool hasIndexSpecifier() const {
+ return ! _hint.eoo() || ! _min.isEmpty() || ! _max.isEmpty();
+ }
+
+        /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there
+           is only a size limit. The idea is that on a find() where one doesn't use many results,
+           we don't return much, but once getmore kicks in, we start pushing significant quantities.
+
+           The n limit (vs. size) is important when someone fetches only one small field from big
+           objects, which would otherwise require massive scanning server-side to fill the size limit.
+        */
+ bool enoughForFirstBatch( int n , int len ) const {
+ if ( _ntoreturn == 0 )
+ return ( len > 1024 * 1024 ) || n >= 101;
+ return n >= _ntoreturn || len > MaxBytesToReturnToClientAtOnce;
+ }
+
+ bool enough( int n ) const {
+ if ( _ntoreturn == 0 )
+ return false;
+ return n >= _ntoreturn;
+ }
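+
+        /* Worked example of the rules above: with ntoreturn == 0 the first batch stops
+           after 101 objects or once it exceeds 1MB, and enough() never stops the query;
+           with ntoreturn == 50, enoughForFirstBatch() and enough() both cut off at
+           n >= 50 (or, for the first batch, at MaxBytesToReturnToClientAtOnce). */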
+
+ private:
+ void init( const BSONObj& q ) {
+ _reset();
+ uassert( 10105 , "bad skip value in query", _ntoskip >= 0);
+
+ if ( _ntoreturn < 0 ) {
+ /* _ntoreturn greater than zero is simply a hint on how many objects to send back per
+ "cursor batch".
+ A negative number indicates a hard limit.
+ */
+ _wantMore = false;
+ _ntoreturn = -_ntoreturn;
+ }
+
+
+ BSONElement e = q["query"];
+ if ( ! e.isABSONObj() )
+ e = q["$query"];
+
+ if ( e.isABSONObj() ) {
+ _filter = e.embeddedObject();
+ _initTop( q );
+ }
+ else {
+ _filter = q;
+ }
+ }
+
+ void _reset() {
+ _wantMore = true;
+ _explain = false;
+ _snapshot = false;
+ _returnKey = false;
+ _showDiskLoc = false;
+ _maxScan = 0;
+ }
+
+ void _initTop( const BSONObj& top ) {
+ BSONObjIterator i( top );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ const char * name = e.fieldName();
+
+ if ( strcmp( "$orderby" , name ) == 0 ||
+ strcmp( "orderby" , name ) == 0 ) {
+ if ( e.type() == Object ) {
+ _order = e.embeddedObject();
+ }
+ else if ( e.type() == Array ) {
+                    _order = transformOrderFromArrayFormat( e.embeddedObject() );
+ }
+ else {
+ uasserted(13513, "sort must be an object or array");
+ }
+ continue;
+ }
+
+ if( *name == '$' ) {
+ name++;
+ if ( strcmp( "explain" , name ) == 0 )
+ _explain = e.trueValue();
+ else if ( strcmp( "snapshot" , name ) == 0 )
+ _snapshot = e.trueValue();
+ else if ( strcmp( "min" , name ) == 0 )
+ _min = e.embeddedObject();
+ else if ( strcmp( "max" , name ) == 0 )
+ _max = e.embeddedObject();
+ else if ( strcmp( "hint" , name ) == 0 )
+ _hint = e;
+ else if ( strcmp( "returnKey" , name ) == 0 )
+ _returnKey = e.trueValue();
+ else if ( strcmp( "maxScan" , name ) == 0 )
+ _maxScan = e.numberInt();
+ else if ( strcmp( "showDiskLoc" , name ) == 0 )
+ _showDiskLoc = e.trueValue();
+ else if ( strcmp( "comment" , name ) == 0 ) {
+ ; // no-op
+ }
+ }
+ }
+
+ if ( _snapshot ) {
+ uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() );
+ uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() );
+ }
+
+ }
+
+ void initFields( const BSONObj& fields ) {
+ if ( fields.isEmpty() )
+ return;
+ _fields.reset( new Projection() );
+ _fields->init( fields );
+ }
+
+ const char * const _ns;
+ const int _ntoskip;
+ int _ntoreturn;
+ BSONObj _filter;
+ BSONObj _order;
+ const int _options;
+ shared_ptr< Projection > _fields;
+ bool _wantMore;
+ bool _explain;
+ bool _snapshot;
+ bool _returnKey;
+ bool _showDiskLoc;
+ BSONObj _min;
+ BSONObj _max;
+ BSONElement _hint;
+ int _maxScan;
+ };
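+
+    /* For illustration (values hypothetical), a wrapped query such as
+           { $query : { x : 1 } , $orderby : { y : -1 } , $explain : true , $maxScan : 100 }
+       parses so that getFilter() == { x : 1 }, getOrder() == { y : -1 }, isExplain() is
+       true, and getMaxScan() == 100; a bare { x : 1 } is taken as the filter itself. */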
+
+
+} // namespace mongo
+
+
diff --git a/src/mongo/db/ops/update.cpp b/src/mongo/db/ops/update.cpp
new file mode 100644
index 00000000000..2abc6987218
--- /dev/null
+++ b/src/mongo/db/ops/update.cpp
@@ -0,0 +1,1308 @@
+// update.cpp
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "query.h"
+#include "../pdfile.h"
+#include "../jsobjmanipulator.h"
+#include "../queryoptimizer.h"
+#include "../repl.h"
+#include "../btree.h"
+#include "../../util/stringutils.h"
+#include "update.h"
+
+//#define DEBUGUPDATE(x) cout << x << endl;
+#define DEBUGUPDATE(x)
+
+namespace mongo {
+
+ const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" ,
+ "$bitand" , "$bitor" , "$bit" , "$addToSet", "$rename", "$rename"
+ };
+ unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*);
+
+ bool Mod::_pullElementMatch( BSONElement& toMatch ) const {
+
+ if ( elt.type() != Object ) {
+ // if elt isn't an object, then comparison will work
+ return toMatch.woCompare( elt , false ) == 0;
+ }
+
+ if ( matcherOnPrimitive )
+ return matcher->matches( toMatch.wrap( "" ) );
+
+ if ( toMatch.type() != Object ) {
+ // looking for an object, so this can't match
+ return false;
+ }
+
+ // now we have an object on both sides
+ return matcher->matches( toMatch.embeddedObject() );
+ }
+
+ template< class Builder >
+ void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const {
+ BSONType a = in.type();
+ BSONType b = elt.type();
+
+ if ( a == NumberDouble || b == NumberDouble ) {
+ ms.incType = NumberDouble;
+ ms.incdouble = elt.numberDouble() + in.numberDouble();
+ }
+ else if ( a == NumberLong || b == NumberLong ) {
+ ms.incType = NumberLong;
+ ms.inclong = elt.numberLong() + in.numberLong();
+ }
+ else {
+ int x = elt.numberInt() + in.numberInt();
+ if ( x < 0 && elt.numberInt() > 0 && in.numberInt() > 0 ) {
+ // overflow
+ ms.incType = NumberLong;
+ ms.inclong = elt.numberLong() + in.numberLong();
+ }
+ else {
+ ms.incType = NumberInt;
+ ms.incint = elt.numberInt() + in.numberInt();
+ }
+ }
+
+ ms.appendIncValue( bb , false );
+ }
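+
+    /* Worked example of the overflow promotion above: applying { $inc : { n : 1 } } to an
+       int field holding 2147483647 (INT_MAX) would wrap to a negative int, so the sum is
+       recomputed in 64 bits and stored as NumberLong( 2147483648 ). */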
+
+ template< class Builder >
+ void appendUnset( Builder &b ) {
+ }
+
+ template<>
+ void appendUnset( BSONArrayBuilder &b ) {
+ b.appendNull();
+ }
+
+ template< class Builder >
+ void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const {
+ if ( ms.dontApply ) {
+ return;
+ }
+
+ switch ( op ) {
+
+ case INC: {
+ appendIncremented( b , in , ms );
+ break;
+ }
+
+ case SET: {
+ _checkForAppending( elt );
+ b.appendAs( elt , shortFieldName );
+ break;
+ }
+
+ case UNSET: {
+ appendUnset( b );
+ break;
+ }
+
+ case PUSH: {
+ uassert( 10131 , "$push can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+ while ( i.more() ) {
+ bb.append( i.next() );
+ n++;
+ }
+
+ ms.pushStartSize = n;
+
+ bb.appendAs( elt , bb.numStr( n ) );
+ bb.done();
+ break;
+ }
+
+ case ADDTOSET: {
+ uassert( 12592 , "$addToSet can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+
+ if ( isEach() ) {
+
+ BSONElementSet toadd;
+ parseEach( toadd );
+
+ while ( i.more() ) {
+ BSONElement cur = i.next();
+ bb.append( cur );
+ n++;
+ toadd.erase( cur );
+ }
+
+ {
+ BSONObjIterator i( getEach() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( toadd.count(e) ) {
+ bb.appendAs( e , BSONObjBuilder::numStr( n++ ) );
+ toadd.erase( e );
+ }
+ }
+ }
+
+ }
+ else {
+
+ bool found = false;
+
+ while ( i.more() ) {
+ BSONElement cur = i.next();
+ bb.append( cur );
+ n++;
+ if ( elt.woCompare( cur , false ) == 0 )
+ found = true;
+ }
+
+ if ( ! found )
+ bb.appendAs( elt , bb.numStr( n ) );
+
+ }
+
+ bb.done();
+ break;
+ }
+
+
+
+ case PUSH_ALL: {
+ uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array );
+            uassert( 10133 , "$pushAll has to be passed an array" , elt.type() == Array );
+
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ BSONObjIterator i( in.embeddedObject() );
+ int n=0;
+ while ( i.more() ) {
+ bb.append( i.next() );
+ n++;
+ }
+
+ ms.pushStartSize = n;
+
+ i = BSONObjIterator( elt.embeddedObject() );
+ while ( i.more() ) {
+ bb.appendAs( i.next() , bb.numStr( n++ ) );
+ }
+
+ bb.done();
+ break;
+ }
+
+ case PULL:
+ case PULL_ALL: {
+ uassert( 10134 , "$pull/$pullAll can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ int n = 0;
+
+ BSONObjIterator i( in.embeddedObject() );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ bool allowed = true;
+
+ if ( op == PULL ) {
+ allowed = ! _pullElementMatch( e );
+ }
+ else {
+ BSONObjIterator j( elt.embeddedObject() );
+ while( j.more() ) {
+ BSONElement arrJ = j.next();
+ if ( e.woCompare( arrJ, false ) == 0 ) {
+ allowed = false;
+ break;
+ }
+ }
+ }
+
+ if ( allowed )
+ bb.appendAs( e , bb.numStr( n++ ) );
+ }
+
+ bb.done();
+ break;
+ }
+
+ case POP: {
+ uassert( 10135 , "$pop can only be applied to an array" , in.type() == Array );
+ BSONObjBuilder bb( b.subarrayStart( shortFieldName ) );
+
+ int n = 0;
+
+ BSONObjIterator i( in.embeddedObject() );
+ if ( elt.isNumber() && elt.number() < 0 ) {
+ // pop from front
+ if ( i.more() ) {
+ i.next();
+ n++;
+ }
+
+ while( i.more() ) {
+ bb.appendAs( i.next() , bb.numStr( n - 1 ) );
+ n++;
+ }
+ }
+ else {
+ // pop from back
+ while( i.more() ) {
+ n++;
+ BSONElement arrI = i.next();
+ if ( i.more() ) {
+ bb.append( arrI );
+ }
+ }
+ }
+
+ ms.pushStartSize = n;
+ assert( ms.pushStartSize == in.embeddedObject().nFields() );
+ bb.done();
+ break;
+ }
+
+ case BIT: {
+            uassert( 10136 , "$bit needs an object" , elt.type() == Object );
+ uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() );
+ uassert( 10138 , "$bit cannot update a value of type double" , in.type() != NumberDouble );
+
+ int x = in.numberInt();
+ long long y = in.numberLong();
+
+ BSONObjIterator it( elt.embeddedObject() );
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ uassert( 10139 , "$bit field must be number" , e.isNumber() );
+ if ( str::equals(e.fieldName(), "and") ) {
+ switch( in.type() ) {
+ case NumberInt: x = x&e.numberInt(); break;
+ case NumberLong: y = y&e.numberLong(); break;
+ default: assert( 0 );
+ }
+ }
+ else if ( str::equals(e.fieldName(), "or") ) {
+ switch( in.type() ) {
+ case NumberInt: x = x|e.numberInt(); break;
+ case NumberLong: y = y|e.numberLong(); break;
+ default: assert( 0 );
+ }
+ }
+ else {
+ uasserted(9016, str::stream() << "unknown $bit operation: " << e.fieldName());
+ }
+ }
+
+ switch( in.type() ) {
+ case NumberInt: b.append( shortFieldName , x ); break;
+ case NumberLong: b.append( shortFieldName , y ); break;
+ default: assert( 0 );
+ }
+
+ break;
+ }
+
+ case RENAME_FROM: {
+ break;
+ }
+
+ case RENAME_TO: {
+ ms.handleRename( b, shortFieldName );
+ break;
+ }
+
+ default:
+ stringstream ss;
+ ss << "Mod::apply can't handle type: " << op;
+ throw UserException( 9017, ss.str() );
+ }
+ }
+
+    // Returns: -1 if the path descends into a non-object (which could be an array),
+    //           0 if the field is missing,
+    //           1 if the field is found.
+ int validRenamePath( BSONObj obj, const char *path ) {
+ while( const char *p = strchr( path, '.' ) ) {
+ string left( path, p - path );
+ BSONElement e = obj.getField( left );
+ if ( e.eoo() ) {
+ return 0;
+ }
+ if ( e.type() != Object ) {
+ return -1;
+ }
+ obj = e.embeddedObject();
+ path = p + 1;
+ }
+ return !obj.getField( path ).eoo();
+ }
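+
+    /* Worked examples for the return values above: with obj = { a : { b : 1 } },
+       validRenamePath( obj, "a.b" ) == 1 (found), validRenamePath( obj, "a.c" ) == 0
+       (missing), and validRenamePath( obj, "a.b.c" ) == -1 (the path descends into the
+       non-object value at "a.b"). */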
+
+ auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const {
+ DEBUGUPDATE( "\t start prepare" );
+ auto_ptr<ModSetState> mss( new ModSetState( obj ) );
+
+
+ // Perform this check first, so that we don't leave a partially modified object on uassert.
+ for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) {
+ DEBUGUPDATE( "\t\t prepare : " << i->first );
+ ModState& ms = mss->_mods[i->first];
+
+ const Mod& m = i->second;
+ BSONElement e = obj.getFieldDotted(m.fieldName);
+
+ ms.m = &m;
+ ms.old = e;
+
+ if ( m.op == Mod::RENAME_FROM ) {
+ int source = validRenamePath( obj, m.fieldName );
+ uassert( 13489, "$rename source field invalid", source != -1 );
+ if ( source != 1 ) {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
+ if ( m.op == Mod::RENAME_TO ) {
+ int source = validRenamePath( obj, m.renameFrom() );
+ if ( source == 1 ) {
+ int target = validRenamePath( obj, m.fieldName );
+ uassert( 13490, "$rename target field invalid", target != -1 );
+ ms.newVal = obj.getFieldDotted( m.renameFrom() );
+ mss->amIInPlacePossible( false );
+ }
+ else {
+ ms.dontApply = true;
+ }
+ continue;
+ }
+
+ if ( e.eoo() ) {
+ mss->amIInPlacePossible( m.op == Mod::UNSET );
+ continue;
+ }
+
+ switch( m.op ) {
+ case Mod::INC:
+ uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() );
+ if ( mss->amIInPlacePossible( e.isNumber() ) ) {
+ // check more typing info here
+ if ( m.elt.type() != e.type() ) {
+                    // if I'm incrementing with a double, then the storage has to be a double
+ mss->amIInPlacePossible( m.elt.type() != NumberDouble );
+ }
+
+ // check for overflow
+ if ( e.type() == NumberInt && e.numberLong() + m.elt.numberLong() > numeric_limits<int>::max() ) {
+ mss->amIInPlacePossible( false );
+ }
+ }
+ break;
+
+ case Mod::SET:
+ mss->amIInPlacePossible( m.elt.type() == e.type() &&
+ m.elt.valuesize() == e.valuesize() );
+ break;
+
+ case Mod::PUSH:
+ case Mod::PUSH_ALL:
+ uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() );
+ mss->amIInPlacePossible( false );
+ break;
+
+ case Mod::PULL:
+ case Mod::PULL_ALL: {
+ uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() );
+ BSONObjIterator i( e.embeddedObject() );
+ while( mss->_inPlacePossible && i.more() ) {
+ BSONElement arrI = i.next();
+ if ( m.op == Mod::PULL ) {
+ mss->amIInPlacePossible( ! m._pullElementMatch( arrI ) );
+ }
+ else if ( m.op == Mod::PULL_ALL ) {
+ BSONObjIterator j( m.elt.embeddedObject() );
+ while( mss->_inPlacePossible && j.moreWithEOO() ) {
+ BSONElement arrJ = j.next();
+ if ( arrJ.eoo() )
+ break;
+ mss->amIInPlacePossible( arrI.woCompare( arrJ, false ) );
+ }
+ }
+ }
+ break;
+ }
+
+ case Mod::POP: {
+ uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() );
+ mss->amIInPlacePossible( e.embeddedObject().isEmpty() );
+ break;
+ }
+
+ case Mod::ADDTOSET: {
+ uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() );
+
+ BSONObjIterator i( e.embeddedObject() );
+ if ( m.isEach() ) {
+ BSONElementSet toadd;
+ m.parseEach( toadd );
+ while( i.more() ) {
+ BSONElement arrI = i.next();
+ toadd.erase( arrI );
+ }
+ mss->amIInPlacePossible( toadd.size() == 0 );
+ }
+ else {
+ bool found = false;
+ while( i.more() ) {
+ BSONElement arrI = i.next();
+ if ( arrI.woCompare( m.elt , false ) == 0 ) {
+ found = true;
+ break;
+ }
+ }
+ mss->amIInPlacePossible( found );
+ }
+ break;
+ }
+
+ default:
+ // mods we don't know about shouldn't be done in place
+ mss->amIInPlacePossible( false );
+ }
+ }
+
+ DEBUGUPDATE( "\t mss\n" << mss->toString() << "\t--" );
+
+ return mss;
+ }
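+
+    /* For illustration (documents hypothetical): given obj = { x : 5 } with x stored as
+       an int, { $set : { x : 7 } } is in-place eligible (same type, same value size),
+       while { $push : { arr : 1 } } or a $set that changes the value size forces a full
+       rewrite via createNewFromMods(). */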
+
+ void ModState::appendForOpLog( BSONObjBuilder& b ) const {
+ if ( dontApply ) {
+ return;
+ }
+
+ if ( incType ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog inc fieldname: " << m->fieldName << " short:" << m->shortFieldName );
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+ appendIncValue( bb , true );
+ bb.done();
+ return;
+ }
+
+ if ( m->op == Mod::RENAME_FROM ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fieldName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$unset" ) );
+ bb.append( m->fieldName, 1 );
+ bb.done();
+ return;
+ }
+
+ if ( m->op == Mod::RENAME_TO ) {
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fieldName:" << m->fieldName );
+ BSONObjBuilder bb( b.subobjStart( "$set" ) );
+            bb.appendAs( newVal, m->fieldName );
+            bb.done();
+            return;
+ }
+
+ const char * name = fixedOpName ? fixedOpName : Mod::modNames[op()];
+
+ DEBUGUPDATE( "\t\t\t\t\t appendForOpLog name:" << name << " fixed: " << fixed << " fn: " << m->fieldName );
+
+ BSONObjBuilder bb( b.subobjStart( name ) );
+ if ( fixed ) {
+ bb.appendAs( *fixed , m->fieldName );
+ }
+ else {
+ bb.appendAs( m->elt , m->fieldName );
+ }
+ bb.done();
+ }
+
+ string ModState::toString() const {
+ stringstream ss;
+ if ( fixedOpName )
+ ss << " fixedOpName: " << fixedOpName;
+ if ( fixed )
+ ss << " fixed: " << fixed;
+ return ss.str();
+ }
+
+ template< class Builder >
+ void ModState::handleRename( Builder &newObjBuilder, const char *shortFieldName ) {
+ newObjBuilder.appendAs( newVal , shortFieldName );
+ BSONObjBuilder b;
+ b.appendAs( newVal, shortFieldName );
+ assert( _objData.isEmpty() );
+ _objData = b.obj();
+ newVal = _objData.firstElement();
+ }
+
+ void ModSetState::applyModsInPlace( bool isOnDisk ) {
+ // TODO i think this assert means that we can get rid of the isOnDisk param
+ // and just use isOwned as the determination
+ DEV assert( isOnDisk == ! _obj.isOwned() );
+
+ for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) {
+ ModState& m = i->second;
+
+ if ( m.dontApply ) {
+ continue;
+ }
+
+ switch ( m.m->op ) {
+ case Mod::UNSET:
+ case Mod::ADDTOSET:
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
+ // this should have been handled by prepare
+ break;
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // this should have been handled by prepare
+ break;
+ case Mod::POP:
+ assert( m.old.eoo() || ( m.old.isABSONObj() && m.old.Obj().isEmpty() ) );
+ break;
+ // [dm] the BSONElementManipulator statements below are for replication (correct?)
+ case Mod::INC:
+ if ( isOnDisk )
+ m.m->IncrementMe( m.old );
+ else
+ m.m->incrementMe( m.old );
+ m.fixedOpName = "$set";
+ m.fixed = &(m.old);
+ break;
+ case Mod::SET:
+ if ( isOnDisk )
+ BSONElementManipulator( m.old ).ReplaceTypeAndValue( m.m->elt );
+ else
+ BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt );
+ break;
+ default:
+ uassert( 13478 , "can't apply mod in place - shouldn't have gotten here" , 0 );
+ }
+ }
+ }
+
+ void ModSet::extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ) {
+ if ( top.type() != Object ) {
+ fields[ base + top.fieldName() ] = top;
+ return;
+ }
+ BSONObjIterator i( top.embeddedObject() );
+ bool empty = true;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ extractFields( fields, e, base + top.fieldName() + "." );
+ empty = false;
+ }
+ if ( empty )
+ fields[ base + top.fieldName() ] = top;
+ }
+
+ template< class Builder >
+ void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ) {
+ const char * temp = m.fieldName();
+ temp += root.size();
+ const char * dot = strchr( temp , '.' );
+ if ( dot ) {
+ string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) );
+ string nf( temp , 0 , dot - temp );
+ if ( onedownseen.count( nf ) )
+ return;
+ onedownseen.insert( nf );
+ BSONObjBuilder bb ( b.subobjStart( nf ) );
+ createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name
+ bb.done();
+ }
+ else {
+ appendNewFromMod( m , b );
+ }
+
+ }
+
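+    // createNewFromMods: merge-walk the document's elements (sorted) and the
+    // pending mods (sorted by field name) in tandem -- copy untouched elements
+    // as-is, apply mods on matching fields, recurse into subobjects when a mod
+    // targets an embedded field, and append brand-new fields at the end.
+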
+ template< class Builder >
+ void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ) {
+ DEBUGUPDATE( "\t\t createNewFromMods root: " << root );
+ BSONObjIteratorSorted es( obj );
+ BSONElement e = es.next();
+
+ ModStateHolder::iterator m = _mods.lower_bound( root );
+ StringBuilder buf(root.size() + 2 );
+ buf << root << (char)255;
+ ModStateHolder::iterator mend = _mods.lower_bound( buf.str() );
+
+ set<string> onedownseen;
+
+ while ( e.type() && m != mend ) {
+ string field = root + e.fieldName();
+ FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field );
+
+ DEBUGUPDATE( "\t\t\t field:" << field << "\t mod:" << m->second.m->fieldName << "\t cmp:" << cmp << "\t short: " << e.fieldName() );
+
+ switch ( cmp ) {
+
+ case LEFT_SUBFIELD: { // Mod is embedded under this element
+ uassert( 10145 , str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array );
+ if ( onedownseen.count( e.fieldName() ) == 0 ) {
+ onedownseen.insert( e.fieldName() );
+ if ( e.type() == Object ) {
+ BSONObjBuilder bb( b.subobjStart( e.fieldName() ) );
+ stringstream nr; nr << root << e.fieldName() << ".";
+ createNewFromMods( nr.str() , bb , e.embeddedObject() );
+ bb.done();
+ }
+ else {
+ BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) );
+ stringstream nr; nr << root << e.fieldName() << ".";
+ createNewFromMods( nr.str() , ba , e.embeddedObject() );
+ ba.done();
+ }
+ // inc both as we handled both
+ e = es.next();
+ m++;
+ }
+ else {
+                    // this is a very weird case; we have seen it in production
+                    // but can't reproduce it. the assert prevents an infinite
+                    // loop but likely isn't the correct solution.
+ assert(0);
+ }
+ continue;
+ }
+ case LEFT_BEFORE: // Mod on a field that doesn't exist
+ DEBUGUPDATE( "\t\t\t\t creating new field for: " << m->second.m->fieldName );
+ _appendNewFromMods( root , m->second , b , onedownseen );
+ m++;
+ continue;
+ case SAME:
+ DEBUGUPDATE( "\t\t\t\t applying mod on: " << m->second.m->fieldName );
+ m->second.apply( b , e );
+ e = es.next();
+ m++;
+ continue;
+ case RIGHT_BEFORE: // field that doesn't have a MOD
+ DEBUGUPDATE( "\t\t\t\t just copying" );
+ b.append( e ); // if array, ignore field name
+ e = es.next();
+ continue;
+ case RIGHT_SUBFIELD:
+ massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
+ break;
+ default:
+ massert( 10400 , "unhandled case" , 0 );
+ }
+ }
+
+ // finished looping the mods, just adding the rest of the elements
+ while ( e.type() ) {
+ DEBUGUPDATE( "\t\t\t copying: " << e.fieldName() );
+ b.append( e ); // if array, ignore field name
+ e = es.next();
+ }
+
+ // do mods that don't have fields already
+ for ( ; m != mend; m++ ) {
+ DEBUGUPDATE( "\t\t\t\t appending from mod at end: " << m->second.m->fieldName );
+ _appendNewFromMods( root , m->second , b , onedownseen );
+ }
+ }
+
+ BSONObj ModSetState::createNewFromMods() {
+ BSONObjBuilder b( (int)(_obj.objsize() * 1.1) );
+ createNewFromMods( "" , b , _obj );
+ return _newFromMods = b.obj();
+ }
+
+ string ModSetState::toString() const {
+ stringstream ss;
+ for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ) {
+ ss << "\t\t" << i->first << "\t" << i->second.toString() << "\n";
+ }
+ return ss.str();
+ }
+
+ bool ModSetState::FieldCmp::operator()( const string &l, const string &r ) const {
+ return lexNumCmp( l.c_str(), r.c_str() ) < 0;
+ }
+
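+    // Illustrative example: for query { a: 1, b: { $gt: 5 } } the seed object
+    // is { a: 1 } -- pure $-operator clauses (and $atomic) are skipped -- and
+    // the mods are then applied to it to form the upsert document.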
+ BSONObj ModSet::createNewFromQuery( const BSONObj& query ) {
+ BSONObj newObj;
+
+ {
+ BSONObjBuilder bb;
+ EmbeddedBuilder eb( &bb );
+ BSONObjIteratorSorted i( query );
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add
+ continue;
+
+ if ( e.type() == Object && e.embeddedObject().firstElementFieldName()[0] == '$' ) {
+ // this means this is a $gt type filter, so don't make part of the new object
+ continue;
+ }
+
+ eb.appendAs( e , e.fieldName() );
+ }
+ eb.done();
+ newObj = bb.obj();
+ }
+
+ auto_ptr<ModSetState> mss = prepare( newObj );
+
+ if ( mss->canApplyInPlace() )
+ mss->applyModsInPlace( false );
+ else
+ newObj = mss->createNewFromMods();
+
+ return newObj;
+ }
+
+ /* get special operations like $inc
+ { $inc: { a:1, b:1 } }
+ { $set: { a:77 } }
+ { $push: { a:55 } }
+ { $pushAll: { a:[77,88] } }
+ { $pull: { a:66 } }
+ { $pullAll : { a:[99,1010] } }
+       NOTE: MODIFIES the source 'from' object!
+ */
+ ModSet::ModSet(
+ const BSONObj &from ,
+ const set<string>& idxKeys,
+ const set<string> *backgroundKeys)
+ : _isIndexed(0) , _hasDynamicArray( false ) {
+
+ BSONObjIterator it(from);
+
+ while ( it.more() ) {
+ BSONElement e = it.next();
+ const char *fn = e.fieldName();
+
+ uassert( 10147 , "Invalid modifier specified: " + string( fn ), e.type() == Object );
+ BSONObj j = e.embeddedObject();
+ DEBUGUPDATE( "\t" << j );
+
+ BSONObjIterator jt(j);
+ Mod::Op op = opFromStr( fn );
+
+ while ( jt.more() ) {
+ BSONElement f = jt.next(); // x:44
+
+ const char * fieldName = f.fieldName();
+
+ uassert( 15896 , "Modified field name may not start with $", fieldName[0] != '$' || op == Mod::UNSET ); // allow remove of invalid field name in case it was inserted before this check was added (~ version 2.1)
+ uassert( 10148 , "Mod on _id not allowed", strcmp( fieldName, "_id" ) != 0 );
+ uassert( 10149 , "Invalid mod field name, may not end in a period", fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 10150 , "Field name duplication not allowed with modifiers", ! haveModForField( fieldName ) );
+ uassert( 10151 , "have conflicting mods in update" , ! haveConflictingMod( fieldName ) );
+ uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC );
+ uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) );
+
+ if ( op == Mod::RENAME_TO ) {
+ uassert( 13494, "$rename target must be a string", f.type() == String );
+ const char *target = f.valuestr();
+ uassert( 13495, "$rename source must differ from target", strcmp( fieldName, target ) != 0 );
+ uassert( 13496, "invalid mod field name, source may not be empty", fieldName[0] );
+ uassert( 13479, "invalid mod field name, target may not be empty", target[0] );
+ uassert( 13480, "invalid mod field name, source may not begin or end in period", fieldName[0] != '.' && fieldName[ strlen( fieldName ) - 1 ] != '.' );
+ uassert( 13481, "invalid mod field name, target may not begin or end in period", target[0] != '.' && target[ strlen( target ) - 1 ] != '.' );
+ uassert( 13482, "$rename affecting _id not allowed", !( fieldName[0] == '_' && fieldName[1] == 'i' && fieldName[2] == 'd' && ( !fieldName[3] || fieldName[3] == '.' ) ) );
+ uassert( 13483, "$rename affecting _id not allowed", !( target[0] == '_' && target[1] == 'i' && target[2] == 'd' && ( !target[3] || target[3] == '.' ) ) );
+ uassert( 13484, "field name duplication not allowed with $rename target", !haveModForField( target ) );
+ uassert( 13485, "conflicting mods not allowed with $rename target", !haveConflictingMod( target ) );
+ uassert( 13486, "$rename target may not be a parent of source", !( strncmp( fieldName, target, strlen( target ) ) == 0 && fieldName[ strlen( target ) ] == '.' ) );
+ uassert( 13487, "$rename source may not be dynamic array", strstr( fieldName , ".$" ) == 0 );
+ uassert( 13488, "$rename target may not be dynamic array", strstr( target , ".$" ) == 0 );
+
+ Mod from;
+ from.init( Mod::RENAME_FROM, f );
+ from.setFieldName( fieldName );
+ updateIsIndexed( from, idxKeys, backgroundKeys );
+ _mods[ from.fieldName ] = from;
+
+ Mod to;
+ to.init( Mod::RENAME_TO, f );
+ to.setFieldName( target );
+ updateIsIndexed( to, idxKeys, backgroundKeys );
+ _mods[ to.fieldName ] = to;
+
+ DEBUGUPDATE( "\t\t " << fieldName << "\t" << from.fieldName << "\t" << to.fieldName );
+ continue;
+ }
+
+                _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) != 0;
+
+ Mod m;
+ m.init( op , f );
+ m.setFieldName( f.fieldName() );
+ updateIsIndexed( m, idxKeys, backgroundKeys );
+ _mods[m.fieldName] = m;
+
+ DEBUGUPDATE( "\t\t " << fieldName << "\t" << m.fieldName << "\t" << _hasDynamicArray );
+ }
+ }
+
+ }
+
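+#if 0
+    // Illustrative sketch (not compiled): the typical flow through ModSet,
+    // using only names from the surrounding code. idxKeys would normally come
+    // from the collection's indexed-field set.
+    inline void modSetFlowExample() {
+        BSONObj updateobj = BSON( "$inc" << BSON( "a" << 1 ) );
+        set<string> idxKeys;
+        ModSet mods( updateobj, idxKeys );
+
+        BSONObj onDisk = BSON( "_id" << 1 << "a" << 41 );
+        auto_ptr<ModSetState> mss = mods.prepare( onDisk );
+        if ( mss->canApplyInPlace() ) {
+            mss->applyModsInPlace( false ); // mutate onDisk's buffer directly
+        }
+        else {
+            BSONObj newObj = mss->createNewFromMods(); // { _id: 1, a: 42 }
+        }
+    }
+#endif
+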
+ ModSet * ModSet::fixDynamicArray( const char * elemMatchKey ) const {
+ ModSet * n = new ModSet();
+ n->_isIndexed = _isIndexed;
+ n->_hasDynamicArray = _hasDynamicArray;
+ for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ) {
+ string s = i->first;
+ size_t idx = s.find( ".$" );
+ if ( idx == string::npos ) {
+ n->_mods[s] = i->second;
+ continue;
+ }
+ StringBuilder buf(s.size()+strlen(elemMatchKey));
+ buf << s.substr(0,idx+1) << elemMatchKey << s.substr(idx+2);
+ string fixed = buf.str();
+ DEBUGUPDATE( "fixed dynamic: " << s << " -->> " << fixed );
+ n->_mods[fixed] = i->second;
+ ModHolder::iterator temp = n->_mods.find( fixed );
+ temp->second.setFieldName( temp->first.c_str() );
+ }
+ return n;
+ }
+
+ void checkNoMods( BSONObj o ) {
+ BSONObjIterator i( o );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ uassert( 10154 , "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' );
+ }
+ }
+
+ static void checkTooLarge(const BSONObj& newObj) {
+ uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= BSONObjMaxUserSize );
+ }
+
+    /* note: as-is this is only called when:
+
+       - not multi
+       - mods are not indexed
+       - not upsert
+    */
+ static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ bool god, const char *ns,
+ const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) {
+
+ DiskLoc loc;
+ {
+ IndexDetails& i = d->idx(idIdxNo);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
+ if( loc.isNull() ) {
+ // no upsert support in _updateById yet, so we are done.
+ return UpdateResult(0, 0, 0);
+ }
+ }
+ Record *r = loc.rec();
+
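+        // If the record isn't resident, page it in while temporarily yielding
+        // the write lock, then re-find everything afterwards since the
+        // collection may have changed (or been dropped) during the yield.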
+ if ( ! r->likelyInPhysicalMemory() ) {
+ {
+ scoped_ptr<LockMongoFilesShared> lk( new LockMongoFilesShared() );
+ dbtempreleasewritelock t;
+ r->touch();
+ lk.reset(0); // we have to release mmmutex before we can re-acquire dbmutex
+ }
+
+ {
+ // we need to re-find in case something changed
+ d = nsdetails( ns );
+ if ( ! d ) {
+ // dropped
+ return UpdateResult(0, 0, 0);
+ }
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ IndexDetails& i = d->idx(idIdxNo);
+ BSONObj key = i.getKeyFromQuery( patternOrig );
+ loc = i.idxInterface().findSingle(i, i.head, key);
+ if( loc.isNull() ) {
+ // no upsert support in _updateById yet, so we are done.
+ return UpdateResult(0, 0, 0);
+ }
+
+ r = loc.rec();
+ }
+ }
+
+        /* look for $inc etc. note: an update is either all modifier ops or a plain
+           replacement; you can't mix modifier and regular fields at the moment. */
+ if ( isOperatorUpdate ) {
+ const BSONObj& onDisk = loc.obj();
+ auto_ptr<ModSetState> mss = mods->prepare( onDisk );
+
+ if( mss->canApplyInPlace() ) {
+ mss->applyModsInPlace(true);
+ DEBUGUPDATE( "\t\t\t updateById doing in place update" );
+ }
+ else {
+ BSONObj newObj = mss->createNewFromMods();
+ checkTooLarge(newObj);
+ assert(nsdt);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ }
+
+ if ( logop ) {
+ DEV assert( mods->size() );
+
+ BSONObj pattern = patternOrig;
+ if ( mss->haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if( mss->needOpLogRewrite() ) {
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
+ }
+ return UpdateResult( 1 , 1 , 1);
+ } // end $operator update
+
+ // regular update
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ assert(nsdt);
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug );
+ if ( logop ) {
+ logOp("u", ns, updateobj, &patternOrig );
+ }
+ return UpdateResult( 1 , 0 , 1 );
+ }
+
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug, RemoveSaver* rs ) {
+ DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi );
+ Client& client = cc();
+ int profile = client.database()->profile;
+
+ debug.updateobj = updateobj;
+
+        // idea with these here is to make them loop invariant for multi updates, and thus be a bit faster for that case
+ // The pointers may be left invalid on a failed or terminal yield recovery.
+ NamespaceDetails *d = nsdetails(ns); // can be null if an upsert...
+ NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get(ns);
+
+ auto_ptr<ModSet> mods;
+ bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$';
+        int modsIsIndexed = 0; // really the # of indexes
+ if ( isOperatorUpdate ) {
+ if( d && d->indexBuildInProgress ) {
+ set<string> bgKeys;
+ d->inProgIdx().keyPattern().getFieldNames(bgKeys);
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) );
+ }
+ else {
+ mods.reset( new ModSet(updateobj, nsdt->indexKeys()) );
+ }
+ modsIsIndexed = mods->isIndexed();
+ }
+
+ if( !multi && isSimpleIdQuery(patternOrig) && d && !modsIsIndexed ) {
+ int idxNo = d->findIdIndex();
+ if( idxNo >= 0 ) {
+ debug.idhack = true;
+ UpdateResult result = _updateById(isOperatorUpdate, idxNo, mods.get(), profile, d, nsdt, god, ns, updateobj, patternOrig, logop, debug);
+ if ( result.existing || ! upsert ) {
+ return result;
+ }
+ else if ( upsert && ! isOperatorUpdate && ! logop) {
+ // this handles repl inserts
+ checkNoMods( updateobj );
+ debug.upsert = true;
+ BSONObj no = updateobj;
+ theDataFileMgr.insertWithObjMod(ns, no, god);
+ return UpdateResult( 0 , 0 , 1 , no );
+ }
+ }
+ }
+
+ int numModded = 0;
+ long long nscanned = 0;
+ shared_ptr< Cursor > c = NamespaceDetailsTransient::getCursor( ns, patternOrig );
+
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ bool autoDedup = c->autoDedup();
+
+ if( c->ok() ) {
+ set<DiskLoc> seenObjects;
+ MatchDetails details;
+ auto_ptr<ClientCursor> cc;
+ do {
+ nscanned++;
+
+ bool atomic = c->matcher() && c->matcher()->docMatcher().atomic();
+
+ if ( !atomic ) {
+ // *****************
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+
+ bool didYield;
+ if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) {
+ cc.release();
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+
+ if ( didYield ) {
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+ // *****************
+ }
+
+ if ( !c->currentMatches( &details ) ) {
+ c->advance();
+
+ if ( nscanned % 256 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ // TODO should we assert or something?
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+ continue;
+ }
+
+ Record *r = c->_current();
+ DiskLoc loc = c->currLoc();
+
+ // TODO Maybe this is unnecessary since we have seenObjects
+ if ( c->getsetdup( loc ) && autoDedup ) {
+ c->advance();
+ continue;
+ }
+
+ BSONObj js(r);
+
+ BSONObj pattern = patternOrig;
+
+ if ( logop ) {
+ BSONObjBuilder idPattern;
+ BSONElement id;
+ // NOTE: If the matching object lacks an id, we'll log
+ // with the original pattern. This isn't replay-safe.
+ // It might make sense to suppress the log instead
+ // if there's no id.
+ if ( js.getObjectID( id ) ) {
+ idPattern.append( id );
+ pattern = idPattern.obj();
+ }
+ else {
+ uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi );
+ }
+ }
+
+ if ( profile && !multi )
+ debug.nscanned = (int) nscanned;
+
+            /* look for $inc etc. note: an update is either all modifier ops or a
+               plain replacement; you can't mix modifier and regular fields at the moment. */
+ if ( isOperatorUpdate ) {
+
+ if ( multi ) {
+ c->advance(); // go to next record in case this one moves
+ if ( autoDedup && seenObjects.count( loc ) )
+ continue;
+ }
+
+ const BSONObj& onDisk = loc.obj();
+
+ ModSet * useMods = mods.get();
+ bool forceRewrite = false;
+
+ auto_ptr<ModSet> mymodset;
+ if ( details._elemMatchKey && mods->hasDynamicArray() ) {
+ useMods = mods->fixDynamicArray( details._elemMatchKey );
+ mymodset.reset( useMods );
+ forceRewrite = true;
+ }
+
+ auto_ptr<ModSetState> mss = useMods->prepare( onDisk );
+
+ bool willAdvanceCursor = multi && c->ok() && ( modsIsIndexed || ! mss->canApplyInPlace() );
+
+ if ( willAdvanceCursor ) {
+ if ( cc.get() ) {
+ cc->setDoingDeletes( true );
+ }
+ c->prepareToTouchEarlierIterate();
+ }
+
+ if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) {
+ mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );
+
+ DEBUGUPDATE( "\t\t\t doing in place update" );
+ if ( profile && !multi )
+ debug.fastmod = true;
+
+ if ( modsIsIndexed ) {
+ seenObjects.insert( loc );
+ }
+
+ d->paddingFits();
+ }
+ else {
+ if ( rs )
+ rs->goingToDelete( onDisk );
+
+ BSONObj newObj = mss->createNewFromMods();
+ checkTooLarge(newObj);
+ DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
+ if ( newLoc != loc || modsIsIndexed ){
+ // log() << "Moved obj " << newLoc.obj()["_id"] << " from " << loc << " to " << newLoc << endl;
+                        // object moved, need to make sure we don't process it again
+ seenObjects.insert( newLoc );
+ }
+
+ }
+
+ if ( logop ) {
+ DEV assert( mods->size() );
+
+ if ( mss->haveArrayDepMod() ) {
+ BSONObjBuilder patternBuilder;
+ patternBuilder.appendElements( pattern );
+ mss->appendSizeSpecForArrayDepMods( patternBuilder );
+ pattern = patternBuilder.obj();
+ }
+
+ if ( forceRewrite || mss->needOpLogRewrite() ) {
+ DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() );
+ logOp("u", ns, mss->getOpLogRewrite() , &pattern );
+ }
+ else {
+ logOp("u", ns, updateobj, &pattern );
+ }
+ }
+ numModded++;
+ if ( ! multi )
+ return UpdateResult( 1 , 1 , numModded );
+ if ( willAdvanceCursor )
+ c->recoverFromTouchingEarlierIterate();
+
+ if ( nscanned % 64 == 0 && ! atomic ) {
+ if ( cc.get() == 0 ) {
+ shared_ptr< Cursor > cPtr = c;
+ cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
+ }
+ if ( ! cc->yield() ) {
+ cc.release();
+ break;
+ }
+ if ( !c->ok() ) {
+ break;
+ }
+ d = nsdetails(ns);
+ nsdt = &NamespaceDetailsTransient::get(ns);
+ }
+
+ getDur().commitIfNeeded();
+
+ continue;
+ }
+
+ uassert( 10158 , "multi update only works with $ operators" , ! multi );
+
+ BSONElementManipulator::lookForTimestamps( updateobj );
+ checkNoMods( updateobj );
+ theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god);
+ if ( logop ) {
+ DEV wassert( !god ); // god doesn't get logged, this would be bad.
+ logOp("u", ns, updateobj, &pattern );
+ }
+ return UpdateResult( 1 , 0 , 1 );
+ } while ( c->ok() );
+ } // endif
+
+ if ( numModded )
+ return UpdateResult( 1 , 1 , numModded );
+
+ // todo: no need for "if( profile )" here as that probably just makes things slower?
+ if ( profile )
+ debug.nscanned = (int) nscanned;
+
+ if ( upsert ) {
+ if ( updateobj.firstElementFieldName()[0] == '$' ) {
+ // upsert of an $operation. build a default object
+ BSONObj newObj = mods->createNewFromQuery( patternOrig );
+ checkNoMods( newObj );
+ debug.fastmodinsert = true;
+ theDataFileMgr.insertWithObjMod(ns, newObj, god);
+ if ( logop )
+ logOp( "i", ns, newObj );
+
+ return UpdateResult( 0 , 1 , 1 , newObj );
+ }
+ uassert( 10159 , "multi update only works with $ operators" , ! multi );
+ checkNoMods( updateobj );
+ debug.upsert = true;
+ BSONObj no = updateobj;
+ theDataFileMgr.insertWithObjMod(ns, no, god);
+ if ( logop )
+ logOp( "i", ns, no );
+ return UpdateResult( 0 , 0 , 1 , no );
+ }
+
+ return UpdateResult( 0 , isOperatorUpdate , 0 );
+ }
+
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) {
+ uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 );
+ if ( strstr(ns, ".system.") ) {
+ /* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */
+ uassert( 10156 , str::stream() << "cannot update system collection: " << ns << " q: " << patternOrig << " u: " << updateobj , legalClientSystemNS( ns , true ) );
+ }
+ return _updateObjects(false, ns, updateobj, patternOrig, upsert, multi, logop, debug);
+ }
+
+}
diff --git a/src/mongo/db/ops/update.h b/src/mongo/db/ops/update.h
new file mode 100644
index 00000000000..9446db06d36
--- /dev/null
+++ b/src/mongo/db/ops/update.h
@@ -0,0 +1,700 @@
+// update.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../../util/embedded_builder.h"
+#include "../matcher.h"
+
+namespace mongo {
+
+ // ---------- public -------------
+
+ struct UpdateResult {
+ const bool existing; // if existing objects were modified
+ const bool mod; // was this a $ mod
+ const long long num; // how many objects touched
+ OID upserted; // if something was upserted, the new _id of the object
+
+ UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() )
+ : existing(e) , mod(m), num(n) {
+ upserted.clear();
+ BSONElement id = upsertedObject["_id"];
+ if ( ! e && n == 1 && id.type() == jstOID ) {
+ upserted = id.OID();
+ }
+ }
+ };
+
+ class RemoveSaver;
+
+ /* returns true if an existing object was updated, false if no existing object was found.
+ multi - update multiple objects - mostly useful with things like $set
+ god - allow access to system namespaces
+ */
+ UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug );
+ UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern,
+ bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 );
+
+
+
+ // ---------- private -------------
+
+ class ModState;
+ class ModSetState;
+
+ /* Used for modifiers such as $inc, $set, $push, ...
+ * stores the info about a single operation
+ * once created should never be modified
+ */
+ struct Mod {
+ // See opFromStr below
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+ enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET, RENAME_FROM, RENAME_TO } op;
+
+ static const char* modNames[];
+ static unsigned modNamesNum;
+
+ const char *fieldName;
+ const char *shortFieldName;
+
+ BSONElement elt; // x:5 note: this is the actual element from the updateobj
+ boost::shared_ptr<Matcher> matcher;
+ bool matcherOnPrimitive;
+
+ void init( Op o , BSONElement& e ) {
+ op = o;
+ elt = e;
+ if ( op == PULL && e.type() == Object ) {
+ BSONObj t = e.embeddedObject();
+ if ( t.firstElement().getGtLtOp() == 0 ) {
+ matcher.reset( new Matcher( t ) );
+ matcherOnPrimitive = false;
+ }
+ else {
+ matcher.reset( new Matcher( BSON( "" << t ) ) );
+ matcherOnPrimitive = true;
+ }
+ }
+ }
+
+ void setFieldName( const char * s ) {
+ fieldName = s;
+ shortFieldName = strrchr( fieldName , '.' );
+ if ( shortFieldName )
+ shortFieldName++;
+ else
+ shortFieldName = fieldName;
+ }
+
+ /**
+         * @param in increments the actual value inside 'in'
+ */
+ void incrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+ switch ( in.type() ) {
+ case NumberDouble:
+ manip.setNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.setLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.setInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+ void IncrementMe( BSONElement& in ) const {
+ BSONElementManipulator manip( in );
+ switch ( in.type() ) {
+ case NumberDouble:
+ manip.SetNumber( elt.numberDouble() + in.numberDouble() );
+ break;
+ case NumberLong:
+ manip.SetLong( elt.numberLong() + in.numberLong() );
+ break;
+ case NumberInt:
+ manip.SetInt( elt.numberInt() + in.numberInt() );
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ template< class Builder >
+ void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const;
+
+ bool operator<( const Mod &other ) const {
+ return strcmp( fieldName, other.fieldName ) < 0;
+ }
+
+ bool arrayDep() const {
+ switch (op) {
+ case PUSH:
+ case PUSH_ALL:
+ case POP:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ static bool isIndexed( const string& fullName , const set<string>& idxKeys ) {
+ const char * fieldName = fullName.c_str();
+ // check if there is an index key that is a parent of mod
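+            // e.g. a mod on "a.b.c" is indexed if there is an index on "a" or "a.b"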
+ for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) )
+ if ( idxKeys.count( string( fieldName, dot - fieldName ) ) )
+ return true;
+
+ // check if there is an index key equal to mod
+ if ( idxKeys.count(fullName) )
+ return true;
+ // check if there is an index key that is a child of mod
+ set< string >::const_iterator j = idxKeys.upper_bound( fullName );
+ if ( j != idxKeys.end() && j->find( fullName ) == 0 && (*j)[fullName.size()] == '.' )
+ return true;
+
+ return false;
+ }
+
+ bool isIndexed( const set<string>& idxKeys ) const {
+ string fullName = fieldName;
+
+ if ( isIndexed( fullName , idxKeys ) )
+ return true;
+
+ if ( strstr( fieldName , "." ) ) {
+ // check for a.0.1
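+                // e.g. "a.0.b" (and "a.$.b") must also be checked as "a.b",
+                // since numeric/positional path components address array
+                // elements that an index on "a.b" would cover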
+ StringBuilder buf( fullName.size() + 1 );
+ for ( size_t i=0; i<fullName.size(); i++ ) {
+ char c = fullName[i];
+
+ if ( c == '$' &&
+ i > 0 && fullName[i-1] == '.' &&
+ i+1<fullName.size() &&
+ fullName[i+1] == '.' ) {
+ i++;
+ continue;
+ }
+
+ buf << c;
+
+ if ( c != '.' )
+ continue;
+
+ if ( ! isdigit( fullName[i+1] ) )
+ continue;
+
+ bool possible = true;
+ size_t j=i+2;
+ for ( ; j<fullName.size(); j++ ) {
+ char d = fullName[j];
+ if ( d == '.' )
+ break;
+ if ( isdigit( d ) )
+ continue;
+ possible = false;
+ break;
+ }
+
+ if ( possible )
+ i = j;
+ }
+ string x = buf.str();
+ if ( isIndexed( x , idxKeys ) )
+ return true;
+ }
+
+ return false;
+ }
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in , ModState& ms ) const;
+
+ /**
+ * @return true iff toMatch should be removed from the array
+ */
+ bool _pullElementMatch( BSONElement& toMatch ) const;
+
+ void _checkForAppending( const BSONElement& e ) const {
+ if ( e.type() == Object ) {
+ // this is a tiny bit slow, but rare and important
+ // only when setting something TO an object, not setting something in an object
+ // and it checks for { $set : { x : { 'a.b' : 1 } } }
+            // which we feel has been common
+ uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() );
+ }
+ }
+
+ bool isEach() const {
+ if ( elt.type() != Object )
+ return false;
+ BSONElement e = elt.embeddedObject().firstElement();
+ if ( e.type() != Array )
+ return false;
+ return strcmp( e.fieldName() , "$each" ) == 0;
+ }
+
+ BSONObj getEach() const {
+ return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck();
+ }
+
+ void parseEach( BSONElementSet& s ) const {
+ BSONObjIterator i(getEach());
+ while ( i.more() ) {
+ s.insert( i.next() );
+ }
+ }
+
+ const char *renameFrom() const {
+ massert( 13492, "mod must be RENAME_TO type", op == Mod::RENAME_TO );
+ return elt.fieldName();
+ }
+ };
+
+ /**
+ * stores a set of Mods
+ * once created, should never be changed
+ */
+ class ModSet : boost::noncopyable {
+ typedef map<string,Mod> ModHolder;
+ ModHolder _mods;
+ int _isIndexed;
+ bool _hasDynamicArray;
+
+ static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base );
+
+ FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const {
+ bool mDone = ( m == _mods.end() );
+ bool pDone = ( p == pEnd );
+ assert( ! mDone );
+ assert( ! pDone );
+ if ( mDone && pDone )
+ return SAME;
+ // If one iterator is done we want to read from the other one, so say the other one is lower.
+ if ( mDone )
+ return RIGHT_BEFORE;
+ if ( pDone )
+ return LEFT_BEFORE;
+
+ return compareDottedFieldNames( m->first, p->first.c_str() );
+ }
+
+ bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) {
+ for( string left = EmbeddedBuilder::splitDot( right );
+ left.length() > 0 && left[ left.length() - 1 ] != '.';
+ left += "." + EmbeddedBuilder::splitDot( right ) ) {
+ if ( existing.count( left ) > 0 && existing[ left ].type() != Object )
+ return false;
+ if ( haveModForField( left.c_str() ) )
+ return false;
+ }
+ return true;
+ }
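+        // Decode a modifier name into its Op by hand-rolled prefix matching,
+        // e.g. "$inc" -> INC, "$pushAll" -> PUSH_ALL. "$rename" deliberately
+        // returns RENAME_TO; the ModSet constructor splits it into a
+        // RENAME_FROM / RENAME_TO pair.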
+ static Mod::Op opFromStr( const char *fn ) {
+ assert( fn[0] == '$' );
+ switch( fn[1] ) {
+ case 'i': {
+ if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 )
+ return Mod::INC;
+ break;
+ }
+ case 's': {
+ if ( fn[2] == 'e' && fn[3] == 't' && fn[4] == 0 )
+ return Mod::SET;
+ break;
+ }
+ case 'p': {
+ if ( fn[2] == 'u' ) {
+ if ( fn[3] == 's' && fn[4] == 'h' ) {
+ if ( fn[5] == 0 )
+ return Mod::PUSH;
+ if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
+ return Mod::PUSH_ALL;
+ }
+ else if ( fn[3] == 'l' && fn[4] == 'l' ) {
+ if ( fn[5] == 0 )
+ return Mod::PULL;
+ if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 )
+ return Mod::PULL_ALL;
+ }
+ }
+ else if ( fn[2] == 'o' && fn[3] == 'p' && fn[4] == 0 )
+ return Mod::POP;
+ break;
+ }
+ case 'u': {
+ if ( fn[2] == 'n' && fn[3] == 's' && fn[4] == 'e' && fn[5] == 't' && fn[6] == 0 )
+ return Mod::UNSET;
+ break;
+ }
+ case 'b': {
+ if ( fn[2] == 'i' && fn[3] == 't' ) {
+ if ( fn[4] == 0 )
+ return Mod::BIT;
+ if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 )
+ return Mod::BITAND;
+ if ( fn[4] == 'o' && fn[5] == 'r' && fn[6] == 0 )
+ return Mod::BITOR;
+ }
+ break;
+ }
+ case 'a': {
+ if ( fn[2] == 'd' && fn[3] == 'd' ) {
+ // add
+ if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 )
+ return Mod::ADDTOSET;
+
+ }
+ break;
+ }
+ case 'r': {
+ if ( fn[2] == 'e' && fn[3] == 'n' && fn[4] == 'a' && fn[5] == 'm' && fn[6] =='e' ) {
+ return Mod::RENAME_TO; // with this return code we handle both RENAME_TO and RENAME_FROM
+ }
+ break;
+ }
+ default: break;
+ }
+ uassert( 10161 , "Invalid modifier specified " + string( fn ), false );
+ return Mod::INC;
+ }
+
+ ModSet() {}
+
+ void updateIsIndexed( const Mod &m, const set<string> &idxKeys, const set<string> *backgroundKeys ) {
+ if ( m.isIndexed( idxKeys ) ||
+ (backgroundKeys && m.isIndexed(*backgroundKeys)) ) {
+ _isIndexed++;
+ }
+ }
+
+ public:
+
+ ModSet( const BSONObj &from ,
+ const set<string>& idxKeys = set<string>(),
+ const set<string>* backgroundKeys = 0
+ );
+
+ // TODO: this is inefficient - should probably just handle when iterating
+ ModSet * fixDynamicArray( const char * elemMatchKey ) const;
+
+ bool hasDynamicArray() const { return _hasDynamicArray; }
+
+ /**
+ * creates a ModSetState suitable for operation on obj
+         * doesn't change or modify this ModSet or any underlying Mod
+ */
+ auto_ptr<ModSetState> prepare( const BSONObj& obj ) const;
+
+ /**
+ * given a query pattern, builds an object suitable for an upsert
+ * will take the query spec and combine all $ operators
+ */
+ BSONObj createNewFromQuery( const BSONObj& query );
+
+        /**
+         * @return the number of mods that touch an indexed field (0 if none)
+         */
+ int isIndexed() const {
+ return _isIndexed;
+ }
+
+ unsigned size() const { return _mods.size(); }
+
+ bool haveModForField( const char *fieldName ) const {
+ return _mods.find( fieldName ) != _mods.end();
+ }
+
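+        // Two mods conflict when one field path is a prefix of the other,
+        // e.g. { $set: { a: 1 } } together with { $set: { "a.b": 2 } }.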
+ bool haveConflictingMod( const string& fieldName ) {
+ size_t idx = fieldName.find( '.' );
+ if ( idx == string::npos )
+ idx = fieldName.size();
+
+ ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx));
+ for ( ; start != _mods.end(); start++ ) {
+ FieldCompareResult r = compareDottedFieldNames( fieldName , start->first );
+ switch ( r ) {
+ case LEFT_SUBFIELD: return true;
+ case LEFT_BEFORE: return false;
+ case SAME: return true;
+ case RIGHT_BEFORE: return false;
+ case RIGHT_SUBFIELD: return true;
+ }
+ }
+            return false;
+        }
+
+ };
+
+ /**
+ * stores any information about a single Mod operating on a single Object
+ */
+ class ModState {
+ public:
+ const Mod * m;
+ BSONElement old;
+ BSONElement newVal;
+ BSONObj _objData;
+
+ const char * fixedOpName;
+ BSONElement * fixed;
+ int pushStartSize;
+
+ BSONType incType;
+ int incint;
+ double incdouble;
+ long long inclong;
+
+ bool dontApply;
+
+ ModState() {
+ fixedOpName = 0;
+ fixed = 0;
+ pushStartSize = -1;
+ incType = EOO;
+ dontApply = false;
+ }
+
+ Mod::Op op() const {
+ return m->op;
+ }
+
+ const char * fieldName() const {
+ return m->fieldName;
+ }
+
+ bool needOpLogRewrite() const {
+ if ( dontApply )
+ return false;
+
+ if ( fixed || fixedOpName || incType )
+ return true;
+
+ switch( op() ) {
+ case Mod::RENAME_FROM:
+ case Mod::RENAME_TO:
+ return true;
+ case Mod::BIT:
+ case Mod::BITAND:
+ case Mod::BITOR:
+ // TODO: should we convert this to $set?
+ return false;
+ default:
+ return false;
+ }
+ }
+
+ void appendForOpLog( BSONObjBuilder& b ) const;
+
+ template< class Builder >
+ void apply( Builder& b , BSONElement in ) {
+ m->apply( b , in , *this );
+ }
+
+ template< class Builder >
+ void appendIncValue( Builder& b , bool useFullName ) const {
+ const char * n = useFullName ? m->fieldName : m->shortFieldName;
+
+ switch ( incType ) {
+ case NumberDouble:
+ b.append( n , incdouble ); break;
+ case NumberLong:
+ b.append( n , inclong ); break;
+ case NumberInt:
+ b.append( n , incint ); break;
+ default:
+ assert(0);
+ }
+ }
+
+ string toString() const;
+
+ template< class Builder >
+ void handleRename( Builder &newObjBuilder, const char *shortFieldName );
+ };
+
+ /**
+ * this is used to hold state, meta data while applying a ModSet to a BSONObj
+ * the goal is to make ModSet const so its re-usable
+ */
+ class ModSetState : boost::noncopyable {
+ struct FieldCmp {
+ bool operator()( const string &l, const string &r ) const;
+ };
+ typedef map<string,ModState,FieldCmp> ModStateHolder;
+ const BSONObj& _obj;
+ ModStateHolder _mods;
+ bool _inPlacePossible;
+ BSONObj _newFromMods; // keep this data alive, as oplog generation may depend on it
+
+ ModSetState( const BSONObj& obj )
+ : _obj( obj ) , _inPlacePossible(true) {
+ }
+
+ /**
+ * @return if in place is still possible
+ */
+ bool amIInPlacePossible( bool inPlacePossible ) {
+ if ( ! inPlacePossible )
+ _inPlacePossible = false;
+ return _inPlacePossible;
+ }
+
+ template< class Builder >
+ void createNewFromMods( const string& root , Builder& b , const BSONObj &obj );
+
+ template< class Builder >
+ void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen );
+
+ template< class Builder >
+ void appendNewFromMod( ModState& ms , Builder& b ) {
+ if ( ms.dontApply ) {
+ return;
+ }
+
+ //const Mod& m = *(ms.m); // HACK
+ Mod& m = *((Mod*)(ms.m)); // HACK
+
+ switch ( m.op ) {
+
+ case Mod::PUSH: {
+ if ( m.isEach() ) {
+ b.appendArray( m.shortFieldName, m.getEach() );
+ } else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+ case Mod::ADDTOSET: {
+ if ( m.isEach() ) {
+ // Remove any duplicates in given array
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ BSONElementSet toadd;
+ m.parseEach( toadd );
+ BSONObjIterator i( m.getEach() );
+ int n = 0;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( toadd.count(e) ) {
+ arr.appendAs( e , BSONObjBuilder::numStr( n++ ) );
+ toadd.erase( e );
+ }
+ }
+ arr.done();
+ }
+ else {
+ BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) );
+ arr.appendAs( m.elt, "0" );
+ arr.done();
+ }
+ break;
+ }
+
+ case Mod::PUSH_ALL: {
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+
+ case Mod::UNSET:
+ case Mod::PULL:
+ case Mod::PULL_ALL:
+ // no-op b/c unset/pull of nothing does nothing
+ break;
+
+ case Mod::INC:
+ ms.fixedOpName = "$set";
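+                // deliberate fall-through: an $inc lands in the new object as
+                // a plain value, so it is logged as a $set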
+ case Mod::SET: {
+ m._checkForAppending( m.elt );
+ b.appendAs( m.elt, m.shortFieldName );
+ break;
+ }
+ // shouldn't see RENAME_FROM here
+ case Mod::RENAME_TO:
+ ms.handleRename( b, m.shortFieldName );
+ break;
+ default:
+ stringstream ss;
+ ss << "unknown mod in appendNewFromMod: " << m.op;
+ throw UserException( 9015, ss.str() );
+ }
+
+ }
+
+ public:
+
+ bool canApplyInPlace() const {
+ return _inPlacePossible;
+ }
+
+ /**
+         * modifies the underlying _obj
+ * @param isOnDisk - true means this is an on disk object, and this update needs to be made durable
+ */
+ void applyModsInPlace( bool isOnDisk );
+
+ BSONObj createNewFromMods();
+
+ // re-writing for oplog
+
+ bool needOpLogRewrite() const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.needOpLogRewrite() )
+ return true;
+ return false;
+ }
+
+ BSONObj getOpLogRewrite() const {
+ BSONObjBuilder b;
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ i->second.appendForOpLog( b );
+ return b.obj();
+ }
+
+ bool haveArrayDepMod() const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ )
+ if ( i->second.m->arrayDep() )
+ return true;
+ return false;
+ }
+
+ void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const {
+ for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) {
+ const ModState& m = i->second;
+ if ( m.m->arrayDep() ) {
+ if ( m.pushStartSize == -1 )
+ b.appendNull( m.fieldName() );
+ else
+ b << m.fieldName() << BSON( "$size" << m.pushStartSize );
+ }
+ }
+ }
+
+ string toString() const;
+
+ friend class ModSet;
+ };
+
+}
+
diff --git a/src/mongo/db/pagefault.cpp b/src/mongo/db/pagefault.cpp
new file mode 100644
index 00000000000..4b9b1b23e02
--- /dev/null
+++ b/src/mongo/db/pagefault.cpp
@@ -0,0 +1,55 @@
+// @file pagefault.cpp
+
+#include "pch.h"
+#include "diskloc.h"
+#include "pagefault.h"
+#include "client.h"
+#include "pdfile.h"
+#include "server.h"
+
+namespace mongo {
+
+ PageFaultException::PageFaultException(Record *_r)
+ {
+ assert( cc()._pageFaultRetryableSection != 0 );
+ cc()._pageFaultRetryableSection->_laps++;
+ assert( cc()._pageFaultRetryableSection->_laps < 1000 );
+ r = _r;
+ era = LockMongoFilesShared::getEra();
+ }
+
+ void PageFaultException::touch() {
+ assert( !d.dbMutex.atLeastReadLocked() );
+ LockMongoFilesShared lk;
+ if( LockMongoFilesShared::getEra() != era ) {
+ // files opened and closed. we don't try to handle but just bail out; this is much simpler
+ // and less error prone and saves us from taking a dbmutex readlock.
+ dlog(2) << "era changed" << endl;
+ return;
+ }
+ r->touch();
+ }
+
+ PageFaultRetryableSection::~PageFaultRetryableSection() {
+ cc()._pageFaultRetryableSection = old;
+ }
+ PageFaultRetryableSection::PageFaultRetryableSection() {
+ _laps = 0;
+ old = cc()._pageFaultRetryableSection;
+ if( d.dbMutex.atLeastReadLocked() ) {
+ cc()._pageFaultRetryableSection = 0;
+ if( debug || logLevel > 2 ) {
+ LOGSOME << "info PageFaultRetryableSection will not yield, already locked upon reaching" << endl;
+ }
+ }
+ else if( cc()._pageFaultRetryableSection ) {
+ cc()._pageFaultRetryableSection = 0;
+ dlog(2) << "info nested PageFaultRetryableSection will not yield on fault" << endl;
+ }
+ else {
+ cc()._pageFaultRetryableSection = this;
+ cc()._hasWrittenThisPass = false;
+ }
+ }
+
+}
diff --git a/src/mongo/db/pagefault.h b/src/mongo/db/pagefault.h
new file mode 100644
index 00000000000..8bbf4ecab52
--- /dev/null
+++ b/src/mongo/db/pagefault.h
@@ -0,0 +1,46 @@
+// @file pagefault.h
+
+// define this : _PAGEFAULTEXCEPTION
+
+#pragma once
+
+namespace mongo {
+
+ class Record;
+
+ class PageFaultException /*: public DBException*/ {
+ unsigned era;
+ Record *r;
+ public:
+ PageFaultException(const PageFaultException& rhs) : era(rhs.era), r(rhs.r) { }
+ explicit PageFaultException(Record*);
+ void touch();
+ };
+
+ class PageFaultRetryableSection : boost::noncopyable {
+ PageFaultRetryableSection *old;
+ public:
+ unsigned _laps;
+ PageFaultRetryableSection();
+ ~PageFaultRetryableSection();
+ };
+#if 0
+ inline void how_to_use_example() {
+ // ...
+ {
+ PageFaultRetryableSection s;
+ while( 1 ) {
+ try {
+ writelock lk; // or readlock
+ // do work
+ break;
+ }
+ catch( PageFaultException& e ) {
+ e.touch();
+ }
+ }
+ }
+ // ...
+ }
+#endif
+}
diff --git a/src/mongo/db/pcre.txt b/src/mongo/db/pcre.txt
new file mode 100644
index 00000000000..3e21047eabc
--- /dev/null
+++ b/src/mongo/db/pcre.txt
@@ -0,0 +1,15 @@
+
+
+You need to install pcre.
+
+This could be scripted:
+
+cd /tmp
+curl -O ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-7.4.tar.gz
+tar -xzf pcre-7.4.tar.gz
+cd pcre-7.4
+./configure --enable-utf8 --with-match-limit=200000 --with-match-limit-recursion=4000
+make
+make install
+
+
+At that point it will be installed in /usr/*. The version in p/pcre-7.4 is for VC++.
diff --git a/src/mongo/db/pdfile.cpp b/src/mongo/db/pdfile.cpp
new file mode 100644
index 00000000000..069eeadec37
--- /dev/null
+++ b/src/mongo/db/pdfile.cpp
@@ -0,0 +1,2425 @@
+// pdfile.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+todo:
+_ table scans must be sequential, not next/prev pointers
+_ coalesce deleted
+_ disallow system* manipulations from the database.
+*/
+
+#include "pch.h"
+#include "pdfile.h"
+#include "db.h"
+#include "../util/mmap.h"
+#include "../util/hashtab.h"
+#include "../util/file_allocator.h"
+#include "../util/processinfo.h"
+#include "../util/file.h"
+#include "btree.h"
+#include "btreebuilder.h"
+#include <algorithm>
+#include <list>
+#include "repl.h"
+#include "dbhelpers.h"
+#include "namespace-inl.h"
+#include "queryutil.h"
+#include "extsort.h"
+#include "curop-inl.h"
+#include "background.h"
+#include "compact.h"
+#include "ops/delete.h"
+#include "instance.h"
+#include "replutil.h"
+
+namespace mongo {
+
+ BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 );
+ BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 );
+
+ void printMemInfo( const char * where ) {
+ cout << "mem info: ";
+ if ( where )
+ cout << where << " ";
+ ProcessInfo pi;
+ if ( ! pi.supported() ) {
+ cout << " not supported" << endl;
+ return;
+ }
+
+ cout << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize() << " mapped: " << ( MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ) << endl;
+ }
+
+ bool isValidNS( const StringData& ns ) {
+ // TODO: should check for invalid characters
+
+ const char * x = strchr( ns.data() , '.' );
+ if ( ! x )
+ return false;
+
+ x++;
+ return *x > 0;
+ }
+
+ bool inDBRepair = false;
+ struct doingRepair {
+ doingRepair() {
+ assert( ! inDBRepair );
+ inDBRepair = true;
+ }
+ ~doingRepair() {
+ inDBRepair = false;
+ }
+ };
+
+ map<string, unsigned> BackgroundOperation::dbsInProg;
+ set<string> BackgroundOperation::nsInProg;
+
+ bool BackgroundOperation::inProgForDb(const char *db) {
+ assertInWriteLock();
+ return dbsInProg[db] != 0;
+ }
+
+ bool BackgroundOperation::inProgForNs(const char *ns) {
+ assertInWriteLock();
+ return nsInProg.count(ns) != 0;
+ }
+
+ void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
+ uassert(12586, "cannot perform operation: a background operation is currently running for this database",
+ !inProgForDb(db));
+ }
+
+ void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
+ uassert(12587, "cannot perform operation: a background operation is currently running for this collection",
+ !inProgForNs(ns));
+ }
+
+ BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
+ assertInWriteLock();
+ dbsInProg[_ns.db]++;
+ assert( nsInProg.count(_ns.ns()) == 0 );
+ nsInProg.insert(_ns.ns());
+ }
+
+ BackgroundOperation::~BackgroundOperation() {
+ wassert( d.dbMutex.isWriteLocked() );
+ dbsInProg[_ns.db]--;
+ nsInProg.erase(_ns.ns());
+ }
+
+ void BackgroundOperation::dump(stringstream& ss) {
+ if( nsInProg.size() ) {
+ ss << "\n<b>Background Jobs in Progress</b>\n";
+ for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ )
+ ss << " " << *i << '\n';
+ }
+ for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) {
+ if( i->second )
+ ss << "database " << i->first << ": " << i->second << '\n';
+ }
+ }
+
+ /* ----------------------------------------- */
+
+ string dbpath = "/data/db/";
+ const char FREELIST_NS[] = ".$freelist";
+ bool directoryperdb = false;
+ string repairpath;
+ string pidfilepath;
+
+ DataFileMgr theDataFileMgr;
+ DatabaseHolder _dbHolder;
+ int MAGIC = 0x1000;
+
+ DatabaseHolder& dbHolderUnchecked() {
+ return _dbHolder;
+ }
+
+ void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
+ void ensureIdIndexForNewNs(const char *ns) {
+ if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) &&
+ strstr( ns, FREELIST_NS ) == 0 ) {
+ log( 1 ) << "adding _id index for collection " << ns << endl;
+ ensureHaveIdIndex( ns );
+ }
+ }
+
+ string getDbContext() {
+ stringstream ss;
+ Client * c = currentClient.get();
+ if ( c ) {
+ Client::Context * cx = c->getContext();
+ if ( cx ) {
+ Database *database = cx->db();
+ if ( database ) {
+ ss << database->name << ' ';
+ ss << cx->ns() << ' ';
+ }
+ }
+ }
+ return ss.str();
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ // inheritable class to implement an operation that may be applied to all
+ // files in a database using _applyOpToDataFiles()
+ class FileOp {
+ public:
+ virtual ~FileOp() {}
+ // Return true if file exists and operation successful
+ virtual bool apply( const boost::filesystem::path &p ) = 0;
+ virtual const char * op() const = 0;
+ };
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );
+
+ void _deleteDataFiles(const char *database) {
+ if ( directoryperdb ) {
+ FileAllocator::get()->waitUntilFinished();
+ MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ), "delete data files with a directoryperdb" );
+ return;
+ }
+ class : public FileOp {
+ virtual bool apply( const boost::filesystem::path &p ) {
+ return boost::filesystem::remove( p );
+ }
+ virtual const char * op() const {
+ return "remove";
+ }
+ } deleter;
+ _applyOpToDataFiles( database, deleter, true );
+ }
+
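+    // Initial extent sizing: 64x the record length for small records, 16x
+    // otherwise, capped at ~1GB and rounded down to a 256-byte multiple
+    // (e.g. initialSize(500) == 500*64 == 32000, already a multiple of 256).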
+ int Extent::initialSize(int len) {
+ long long sz = len * 16;
+ if ( len < 1000 ) sz = len * 64;
+ if ( sz > 1000000000 )
+ sz = 1000000000;
+ int z = ((int)sz) & 0xffffff00;
+ assert( z > len );
+ return z;
+ }
+
+ bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) {
+ if ( nsdetails(ns) ) {
+ err = "collection already exists";
+ return false;
+ }
+
+ log(1) << "create collection " << ns << ' ' << options << endl;
+
+ /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
+ and then go back and set to ok : 1 after we are done.
+ */
+ bool isFreeList = strstr(ns, FREELIST_NS) != 0;
+ if( !isFreeList )
+ addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);
+
+ long long size = Extent::initialSize(128);
+ {
+ BSONElement e = options.getField("size");
+ if ( e.isNumber() ) {
+ size = e.numberLong();
+ size += 256;
+ size &= 0xffffffffffffff00LL;
+ }
+ }
+
+ uassert( 10083 , "create collection invalid size spec", size > 0 );
+
+ bool newCapped = false;
+ int mx = 0;
+ if( options["capped"].trueValue() ) {
+ newCapped = true;
+ BSONElement e = options.getField("max");
+ if ( e.isNumber() ) {
+ mx = e.numberInt();
+ }
+ }
+
+ // $nExtents just for debug/testing.
+ BSONElement e = options.getField( "$nExtents" );
+ Database *database = cc().database();
+ if ( e.type() == Array ) {
+ // We create one extent per array entry, with size specified
+ // by the array value.
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ int size = int( e.number() );
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else if ( int( e.number() ) > 0 ) {
+ // We create '$nExtents' extents, each of size 'size'.
+ int nExtents = int( e.number() );
+ assert( size <= 0x7fffffff );
+ for ( int i = 0; i < nExtents; ++i ) {
+ assert( size <= 0x7fffffff );
+ // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictability
+ // in the extent size used by our tests
+ database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+ }
+ }
+ else {
+            // This is the non-test case, where we don't have a $nExtents spec.
+ while ( size > 0 ) {
+ int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
+ int desiredExtentSize = (int) (size > max ? max : size);
+ if ( desiredExtentSize < Extent::minSize() ) {
+ desiredExtentSize = Extent::minSize();
+ }
+ desiredExtentSize &= 0xffffff00;
+ Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped, true );
+ size -= e->length;
+ }
+ }
+
+ NamespaceDetails *d = nsdetails(ns);
+ assert(d);
+
+ bool ensure = false;
+ if ( options.getField( "autoIndexId" ).type() ) {
+ if ( options["autoIndexId"].trueValue() ) {
+ ensure = true;
+ }
+ }
+ else {
+ if ( !newCapped ) {
+ ensure=true;
+ }
+ }
+ if( ensure ) {
+ if( deferIdIndex )
+ *deferIdIndex = true;
+ else
+ ensureIdIndexForNewNs( ns );
+ }
+
+ if ( mx > 0 )
+ getDur().writingInt( d->max ) = mx;
+
+ return true;
+ }
+
+ /** { ..., capped: true, size: ..., max: ... }
+       @param deferIdIndex - if not null, defers id index creation. sets the bool value to true if we wanted to create the id index.
+ @return true if successful
+ */
+ bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
+ const char *coll = strchr( ns, '.' ) + 1;
+ massert( 10356 , str::stream() << "invalid ns: " << ns , NamespaceString::validCollectionName(ns));
+ char cl[ 256 ];
+ nsToDatabase( ns, cl );
+ bool ok = _userCreateNS(ns, options, err, deferIdIndex);
+ if ( logForReplication && ok ) {
+ if ( options.getField( "create" ).eoo() ) {
+ BSONObjBuilder b;
+ b << "create" << coll;
+ b.appendElements( options );
+ options = b.obj();
+ }
+ string logNs = string( cl ) + ".$cmd";
+ logOp("c", logNs.c_str(), options);
+ }
+ return ok;
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ int MongoDataFile::maxSize() {
+ if ( sizeof( int* ) == 4 ) {
+ return 512 * 1024 * 1024;
+ }
+ else if ( cmdLine.smallfiles ) {
+ return 0x7ff00000 >> 2;
+ }
+ else {
+ return 0x7ff00000;
+ }
+ }
+
+ NOINLINE_DECL void MongoDataFile::badOfs2(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13441, ss.str());
+ }
+
+ NOINLINE_DECL void MongoDataFile::badOfs(int ofs) const {
+ stringstream ss;
+ ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
+ uasserted(13440, ss.str());
+ }
+
+ int MongoDataFile::defaultSize( const char *filename ) const {
+ int size;
+ if ( fileNo <= 4 )
+ size = (64*1024*1024) << fileNo;
+ else
+ size = 0x7ff00000;
+ if ( cmdLine.smallfiles ) {
+ size = size >> 2;
+ }
+ return size;
+ }
+
+ static void check(void *_mb) {
+ if( sizeof(char *) == 4 )
+ uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
+ else
+ uassert( 10085 , "can't map file memory", _mb != 0);
+ }
+
+ /** @return true if found and opened. if uninitialized (prealloc only) does not open. */
+ bool MongoDataFile::openExisting( const char *filename ) {
+ assert( _mb == 0 );
+ if( !exists(filename) )
+ return false;
+ if( !mmf.open(filename,false) ) {
+ dlog(2) << "info couldn't open " << filename << " probably end of datafile list" << endl;
+ return false;
+ }
+ _mb = mmf.getView(); assert(_mb);
+ unsigned long long sz = mmf.length();
+ assert( sz <= 0x7fffffff );
+ assert( sz % 4096 == 0 );
+ if( sz < 64*1024*1024 && !cmdLine.smallfiles ) {
+ if( sz >= 16*1024*1024 && sz % (1024*1024) == 0 ) {
+ log() << "info openExisting file size " << sz << " but cmdLine.smallfiles=false" << endl;
+ }
+ else {
+                log() << "openExisting size " << sz << " less than minimum file size expectation " << filename << endl;
+ assert(false);
+ }
+ }
+ check(_mb);
+ if( header()->uninitialized() )
+ return false;
+ return true;
+ }
+
+ void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
+ long size = defaultSize( filename );
+ while ( size < minSize ) {
+ if ( size < maxSize() / 2 )
+ size *= 2;
+ else {
+ size = maxSize();
+ break;
+ }
+ }
+ if ( size > maxSize() )
+ size = maxSize();
+
+ assert( size >= 64*1024*1024 || cmdLine.smallfiles );
+ assert( size % 4096 == 0 );
+
+ if ( preallocateOnly ) {
+ if ( cmdLine.prealloc ) {
+ FileAllocator::get()->requestAllocation( filename, size );
+ }
+ return;
+ }
+
+ {
+ assert( _mb == 0 );
+ unsigned long long sz = size;
+ if( mmf.create(filename, sz, false) )
+ _mb = mmf.getView();
+ assert( sz <= 0x7fffffff );
+ size = (int) sz;
+ }
+ check(_mb);
+ header()->init(fileNo, size, filename);
+ }
+
+ void MongoDataFile::flush( bool sync ) {
+ mmf.flush( sync );
+ }
+
+ void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
+ NamespaceIndex *ni = nsindex(ns);
+ NamespaceDetails *details = ni->details(ns);
+ if ( details ) {
+ assert( !details->lastExtent.isNull() );
+ assert( !details->firstExtent.isNull() );
+ getDur().writingDiskLoc(e->xprev) = details->lastExtent;
+ getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
+ assert( !eloc.isNull() );
+ getDur().writingDiskLoc(details->lastExtent) = eloc;
+ }
+ else {
+ ni->add_ns(ns, eloc, capped);
+ details = ni->details(ns);
+ }
+
+ {
+ NamespaceDetails *dw = details->writingWithoutExtra();
+ dw->lastExtentSize = e->length;
+ }
+ details->addDeletedRec(emptyLoc.drec(), emptyLoc);
+ }
+
+ Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
+ {
+ // make sizes align with VM page size
+ int newSize = (approxSize + 0xfff) & 0xfffff000;
+ assert( newSize >= 0 );
+ if( newSize < Extent::maxSize() )
+ approxSize = newSize;
+ }
+ massert( 10357 , "shutdown in progress", ! inShutdown() );
+ massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() );
+ massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
+ int ExtentSize = min(header()->unusedLength, approxSize);
+ DiskLoc loc;
+ if ( ExtentSize < Extent::minSize() ) {
+            /* note there could be a lot of looping here if the db just started and
+               no files are open yet. we might want to do something about that. */
+ if ( loops > 8 ) {
+ assert( loops < 10000 );
+ out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n';
+ }
+ log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
+ return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
+ }
+ int offset = header()->unused.getOfs();
+
+ DataFileHeader *h = header();
+ h->unused.writing().set( fileNo, offset + ExtentSize );
+ getDur().writingInt(h->unusedLength) = h->unusedLength - ExtentSize;
+ loc.set(fileNo, offset);
+ Extent *e = _getExtent(loc);
+ DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset, newCapped);
+
+ addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);
+
+ DEV tlog(1) << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset
+ << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl;
+ return e;
+ }
+
+ Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
+ string s = cc().database()->name + FREELIST_NS;
+ NamespaceDetails *f = nsdetails(s.c_str());
+ if( f ) {
+ int low, high;
+ if( capped ) {
+ // be strict about the size
+ low = approxSize;
+ if( low > 2048 ) low -= 256;
+ high = (int) (approxSize * 1.05) + 256;
+ }
+ else {
+ low = (int) (approxSize * 0.8);
+ high = (int) (approxSize * 1.4);
+ }
+ if( high <= 0 ) {
+ // overflowed
+ high = max(approxSize, Extent::maxSize());
+ }
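+            /* e.g. (illustrative numbers only) a non-capped request of approxSize
+               100000 searches the window [80000, 140000]; a capped request of the
+               same size searches the much stricter [99744, 105256], since capped
+               extents should closely match the requested size. */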
+ int n = 0;
+ Extent *best = 0;
+ int bestDiff = 0x7fffffff;
+ {
+ Timer t;
+ DiskLoc L = f->firstExtent;
+ while( !L.isNull() ) {
+ Extent * e = L.ext();
+ if( e->length >= low && e->length <= high ) {
+ int diff = abs(e->length - approxSize);
+ if( diff < bestDiff ) {
+ bestDiff = diff;
+ best = e;
+ if( ((double) diff) / approxSize < 0.1 ) {
+ // close enough
+ break;
+ }
+ if( t.seconds() >= 2 ) {
+ // have spent lots of time in write lock, and we are in [low,high], so close enough
+ // could come into play if extent freelist is very long
+ break;
+ }
+ }
+ else {
+ OCCASIONALLY {
+ if( high < 64 * 1024 && t.seconds() >= 2 ) {
+ // be less picky if it is taking a long time
+ high = 64 * 1024;
+ }
+ }
+ }
+ }
+ L = e->xnext;
+ ++n;
+ }
+ if( t.seconds() >= 10 ) {
+ log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl;
+ }
+ }
+
+ if( n > 128 ) log( n < 512 ) << "warning: newExtent " << n << " scanned\n";
+
+ if( best ) {
+ Extent *e = best;
+ // remove from the free list
+ if( !e->xprev.isNull() )
+ e->xprev.ext()->xnext.writing() = e->xnext;
+ if( !e->xnext.isNull() )
+ e->xnext.ext()->xprev.writing() = e->xprev;
+ if( f->firstExtent == e->myLoc )
+ f->firstExtent.writing() = e->xnext;
+ if( f->lastExtent == e->myLoc )
+ f->lastExtent.writing() = e->xprev;
+
+ // use it
+ OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
+ DiskLoc emptyLoc = e->reuse(ns, capped);
+ addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped);
+ return e;
+ }
+ }
+
+ return 0;
+ // return createExtent(ns, approxSize, capped);
+ }
+
+ /*---------------------------------------------------------------------*/
+
+ void Extent::markEmpty() {
+ xnext.Null();
+ xprev.Null();
+ firstRecord.Null();
+ lastRecord.Null();
+ }
+
+ DiskLoc Extent::reuse(const char *nsname, bool capped) {
+ return getDur().writing(this)->_reuse(nsname, capped);
+ }
+
+ void getEmptyLoc(const char *ns, const DiskLoc extentLoc, int extentLength, bool capped, /*out*/DiskLoc& emptyLoc, /*out*/int& delRecLength) {
+ emptyLoc = extentLoc;
+ emptyLoc.inc( Extent::HeaderSize() );
+ delRecLength = extentLength - Extent::HeaderSize();
+ if( delRecLength >= 32*1024 && str::contains(ns, '$') && !capped ) {
+ // probably an index. so skip forward to keep its records page aligned
+ int& ofs = emptyLoc.GETOFS();
+ int newOfs = (ofs + 0xfff) & ~0xfff;
+ delRecLength -= (newOfs-ofs);
+ dassert( delRecLength > 0 );
+ ofs = newOfs;
+ }
+ }
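+
+    /* illustration of the alignment above: if the empty region would start at
+       ofs 0x12ab0, newOfs = (0x12ab0 + 0xfff) & ~0xfff = 0x13000, so the deleted
+       record shrinks by 0x550 bytes and the btree buckets that follow start on a
+       4KB page boundary. */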
+
+ DiskLoc Extent::_reuse(const char *nsname, bool capped) {
+ LOG(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
+ massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 );
+ nsDiagnostic = nsname;
+ markEmpty();
+
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength);
+
+ // todo: some dup code here and below in Extent::init
+ DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength);
+ empty = getDur().writing(empty);
+ empty->lengthWithHeaders = delRecLength;
+ empty->extentOfs = myLoc.getOfs();
+ empty->nextDeleted.Null();
+
+ return emptyLoc;
+ }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps */
+ DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) {
+ magic = 0x41424344;
+ myLoc.set(_fileNo, _offset);
+ xnext.Null();
+ xprev.Null();
+ nsDiagnostic = nsname;
+ length = _length;
+ firstRecord.Null();
+ lastRecord.Null();
+
+ DiskLoc emptyLoc;
+ int delRecLength;
+ getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength);
+
+ DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) );
+ empty->lengthWithHeaders = delRecLength;
+ empty->extentOfs = myLoc.getOfs();
+
+ return emptyLoc;
+ }
+
+ /*
+ Record* Extent::newRecord(int len) {
+        if( firstEmptyRegion.isNull() )
+ return 0;
+
+ assert(len > 0);
+ int newRecSize = len + Record::HeaderSize;
+ DiskLoc newRecordLoc = firstEmptyRegion;
+ Record *r = getRecord(newRecordLoc);
+ int left = r->netLength() - len;
+ if( left < 0 ) {
+ //
+ firstEmptyRegion.Null();
+ return 0;
+ }
+
+ DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
+ r->lengthWithHeaders = newRecSize;
+ r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
+ if( !lastRecord.isNull() ) {
+ assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
+ getRecord(lastRecord)->next.set(newRecordLoc); // until now
+ r->prev.set(lastRecord);
+ }
+ else {
+ r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
+ assert( firstRecord.isNull() );
+ firstRecord = newRecordLoc;
+ }
+ lastRecord = newRecordLoc;
+
+ if( left < Record::HeaderSize + 32 ) {
+ firstEmptyRegion.Null();
+ }
+ else {
+ firstEmptyRegion.inc(newRecSize);
+ Record *empty = getRecord(firstEmptyRegion);
+            empty->next.set(nextEmpty); // unlike for in-use records, next and prev can be null for empty records.
+ empty->prev.Null();
+ empty->lengthWithHeaders = left;
+ }
+
+ return r;
+ }
+ */
+
+ int Extent::maxSize() {
+ int maxExtentSize = 0x7ff00000;
+ if ( cmdLine.smallfiles ) {
+ maxExtentSize >>= 2;
+ }
+ return maxExtentSize;
+ }
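+
+    /* i.e. roughly 2047MB normally and roughly 511MB with --smallfiles, matching
+       the largest datafile size so a single extent never outgrows its file. */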
+
+ /*---------------------------------------------------------------------*/
+
+ shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
+ NamespaceDetails * d = nsdetails( ns );
+ if ( ! d )
+ return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
+
+ DiskLoc loc = d->firstExtent;
+ Extent *e = getExtent(loc);
+
+ DEBUGGING {
+ out() << "listing extents for " << ns << endl;
+ DiskLoc tmp = loc;
+ set<DiskLoc> extents;
+
+ while ( 1 ) {
+ Extent *f = getExtent(tmp);
+ out() << "extent: " << tmp.toString() << endl;
+ extents.insert(tmp);
+ tmp = f->xnext;
+ if ( tmp.isNull() )
+ break;
+ f = f->getNextExtent();
+ }
+
+ out() << endl;
+ d->dumpDeleted(&extents);
+ }
+
+ if ( d->capped )
+ return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );
+
+ if ( !startLoc.isNull() )
+ return shared_ptr<Cursor>(new BasicCursor( startLoc ));
+
+ while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
+            /* todo: if extent is empty, free it for reuse elsewhere.
+               that is a bit complicated; we would have to clean up the freelists.
+            */
+ RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl;
+ // find a nonempty extent
+ // it might be nice to free the whole extent here! but have to clean up free recs then.
+ e = e->getNextExtent();
+ }
+ return shared_ptr<Cursor>(new BasicCursor( e->firstRecord ));
+ }
+
+ /* get a table scan cursor, but can be forward or reverse direction.
+ order.$natural - if set, > 0 means forward (asc), < 0 backward (desc).
+ */
+ shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
+ BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 }
+
+ if ( el.number() >= 0 )
+ return DataFileMgr::findAll(ns, startLoc);
+
+ // "reverse natural order"
+ NamespaceDetails *d = nsdetails(ns);
+
+ if ( !d )
+ return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
+
+ if ( !d->capped ) {
+ if ( !startLoc.isNull() )
+ return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
+ Extent *e = d->lastExtent.ext();
+ while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
+ OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl;
+ e = e->getPrevExtent();
+ }
+ return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
+ }
+ else {
+ return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
+ }
+ }
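+
+    /* usage sketch (hypothetical caller): a reverse scan is requested via the
+       $natural sort spec, e.g.
+           shared_ptr<Cursor> c = findTableScan("test.foo", BSON("$natural" << -1), DiskLoc());
+           while( c->ok() ) { BSONObj o = c->current(); c->advance(); }
+       a missing or non-negative $natural (note number() is 0 for a missing
+       element) takes the forward findAll() path above. */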
+
+ void printFreeList() {
+ string s = cc().database()->name + FREELIST_NS;
+ log() << "dump freelist " << s << endl;
+ NamespaceDetails *freeExtents = nsdetails(s.c_str());
+ if( freeExtents == 0 ) {
+ log() << " freeExtents==0" << endl;
+ return;
+ }
+ DiskLoc a = freeExtents->firstExtent;
+ while( !a.isNull() ) {
+ Extent *e = a.ext();
+ log() << " extent " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << endl;
+ a = e->xnext;
+ }
+
+ log() << "end freelist" << endl;
+ }
+
+    /** free a list of extents that are no longer in use. this is a doubly linked list of extents
+        (could be just one in the list)
+    */
+ void freeExtents(DiskLoc firstExt, DiskLoc lastExt) {
+ {
+ assert( !firstExt.isNull() && !lastExt.isNull() );
+ Extent *f = firstExt.ext();
+ Extent *l = lastExt.ext();
+ assert( f->xprev.isNull() );
+ assert( l->xnext.isNull() );
+ assert( f==l || !f->xnext.isNull() );
+ assert( f==l || !l->xprev.isNull() );
+ }
+
+ string s = cc().database()->name + FREELIST_NS;
+ NamespaceDetails *freeExtents = nsdetails(s.c_str());
+ if( freeExtents == 0 ) {
+ string err;
+ _userCreateNS(s.c_str(), BSONObj(), err, 0); // todo: this actually allocates an extent, which is bad!
+ freeExtents = nsdetails(s.c_str());
+ massert( 10361 , "can't create .$freelist", freeExtents);
+ }
+ if( freeExtents->firstExtent.isNull() ) {
+ freeExtents->firstExtent.writing() = firstExt;
+ freeExtents->lastExtent.writing() = lastExt;
+ }
+ else {
+ DiskLoc a = freeExtents->firstExtent;
+ assert( a.ext()->xprev.isNull() );
+ getDur().writingDiskLoc( a.ext()->xprev ) = lastExt;
+ getDur().writingDiskLoc( lastExt.ext()->xnext ) = a;
+ getDur().writingDiskLoc( freeExtents->firstExtent ) = firstExt;
+ }
+
+ //printFreeList();
+ }
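+
+    /* note the else branch above splices the freed chain onto the *front* of the
+       freelist -- three DiskLoc writes total: the old head's xprev, the freed
+       tail's xnext, and the freelist's firstExtent. */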
+
+ /* drop a collection/namespace */
+ void dropNS(const string& nsToDrop) {
+ NamespaceDetails* d = nsdetails(nsToDrop.c_str());
+ uassert( 10086 , (string)"ns not found: " + nsToDrop , d );
+
+ BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str());
+
+ NamespaceString s(nsToDrop);
+ assert( s.db == cc().database()->name );
+ if( s.isSystem() ) {
+ if( s.coll == "system.profile" )
+ uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
+ else
+ uasserted( 12502, "can't drop system ns" );
+ }
+
+ {
+ // remove from the system catalog
+ BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" }
+ string system_namespaces = cc().database()->name + ".system.namespaces";
+ /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
+ // no check of return code as this ns won't exist for some of the new storage engines
+ }
+
+ // free extents
+ if( !d->firstExtent.isNull() ) {
+ freeExtents(d->firstExtent, d->lastExtent);
+ getDur().writingDiskLoc( d->firstExtent ).setInvalid();
+ getDur().writingDiskLoc( d->lastExtent ).setInvalid();
+ }
+
+ // remove from the catalog hashtable
+ cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str());
+ }
+
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) {
+ log(1) << "dropCollection: " << name << endl;
+ NamespaceDetails *d = nsdetails(name.c_str());
+ if( d == 0 )
+ return;
+
+ BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());
+
+ if ( d->nIndexes != 0 ) {
+ try {
+ assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
+ }
+ catch( DBException& e ) {
+ stringstream ss;
+ ss << "drop: dropIndexes for collection failed - consider trying repair ";
+ ss << " cause: " << e.what();
+ uasserted(12503,ss.str());
+ }
+ assert( d->nIndexes == 0 );
+ }
+ log(1) << "\t dropIndexes done" << endl;
+ result.append("ns", name.c_str());
+ ClientCursor::invalidate(name.c_str());
+ Top::global.collectionDropped( name );
+ NamespaceDetailsTransient::eraseForPrefix( name.c_str() );
+ dropNS(name);
+ }
+
+ /* unindex all keys in index for this record. */
+ static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) {
+ BSONObjSet keys;
+ id.getKeysFromObject(obj, keys);
+ IndexInterface& ii = id.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ BSONObj j = *i;
+
+ bool ok = false;
+ try {
+ ok = ii.unindex(id.head, id, j, dl);
+ }
+ catch (AssertionException& e) {
+ problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl;
+ out() << "Assertion failure: _unindex failed: " << e.what() << '\n';
+ out() << " obj:" << obj.toString() << '\n';
+ out() << " key:" << j.toString() << '\n';
+ out() << " dl:" << dl.toString() << endl;
+ sayDbContext();
+ }
+
+ if ( !ok && logMissing ) {
+ log() << "unindex failed (key too big?) " << id.indexNamespace() << " key: " << j << " " << obj["_id"] << endl;
+ }
+ }
+ }
+ /* unindex all keys in all indexes for this record. */
+ static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) {
+ BSONObj obj(todelete);
+ int n = d->nIndexes;
+ for ( int i = 0; i < n; i++ )
+ _unindexRecord(d->idx(i), obj, dl, !noWarn);
+ if( d->indexBuildInProgress ) { // background index
+            // always pass noWarn here, since this key may legitimately be missing while we are concurrently building the index
+ _unindexRecord(d->idx(n), obj, dl, false);
+ }
+ }
+
+ /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
+ caller must check if capped
+ */
+ void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
+ /* remove ourself from the record next/prev chain */
+ {
+ if ( todelete->prevOfs != DiskLoc::NullOfs )
+ getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
+ if ( todelete->nextOfs != DiskLoc::NullOfs )
+ getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
+ }
+
+ /* remove ourself from extent pointers */
+ {
+ Extent *e = getDur().writing( todelete->myExtent(dl) );
+ if ( e->firstRecord == dl ) {
+ if ( todelete->nextOfs == DiskLoc::NullOfs )
+ e->firstRecord.Null();
+ else
+ e->firstRecord.set(dl.a(), todelete->nextOfs);
+ }
+ if ( e->lastRecord == dl ) {
+ if ( todelete->prevOfs == DiskLoc::NullOfs )
+ e->lastRecord.Null();
+ else
+ e->lastRecord.set(dl.a(), todelete->prevOfs);
+ }
+ }
+
+ /* add to the free list */
+ {
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize -= todelete->netLength();
+ s->nrecords--;
+ }
+
+ if ( strstr(ns, ".system.indexes") ) {
+ /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+ careful until validated more, as IndexDetails has pointers
+ to this disk location. so an incorrectly done remove would cause
+ a lot of problems.
+ */
+ memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
+ }
+ else {
+ DEV {
+ unsigned long long *p = (unsigned long long *) todelete->data;
+ *getDur().writing(p) = 0;
+ //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
+ }
+ d->addDeletedRec((DeletedRecord*)todelete, dl);
+ }
+ }
+ }
+
+ void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) {
+ dassert( todelete == dl.rec() );
+
+ NamespaceDetails* d = nsdetails(ns);
+ if ( d->capped && !cappedOK ) {
+ out() << "failing remove on a capped ns " << ns << endl;
+ uassert( 10089 , "can't remove from a capped collection" , 0 );
+ return;
+ }
+
+ BSONObj toDelete;
+ if ( doLog ) {
+ BSONElement e = dl.obj()["_id"];
+ if ( e.type() ) {
+ toDelete = e.wrap();
+ }
+ }
+
+ /* check if any cursors point to us. if so, advance them. */
+ ClientCursor::aboutToDelete(dl);
+
+ unindexRecord(d, todelete, dl, noWarn);
+
+ _deleteRecord(d, ns, todelete, dl);
+ NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+
+ if ( ! toDelete.isEmpty() ) {
+ logOp( "d" , ns , toDelete );
+ }
+ }
+
+
+ /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record.
+ */
+ const DiskLoc DataFileMgr::updateRecord(
+ const char *ns,
+ NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ Record *toupdate, const DiskLoc& dl,
+ const char *_buf, int _len, OpDebug& debug, bool god) {
+
+ dassert( toupdate == dl.rec() );
+
+ BSONObj objOld(toupdate);
+ BSONObj objNew(_buf);
+ DEV assert( objNew.objsize() == _len );
+ DEV assert( objNew.objdata() == _buf );
+
+ if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
+ /* add back the old _id value if the update removes it. Note this implementation is slow
+ (copies entire object multiple times), but this shouldn't happen often, so going for simple
+ code, not speed.
+ */
+ BSONObjBuilder b;
+ BSONElement e;
+ assert( objOld.getObjectID(e) );
+ b.append(e); // put _id first, for best performance
+ b.appendElements(objNew);
+ objNew = b.obj();
+ }
+
+ /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
+ below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
+ */
+ vector<IndexChanges> changes;
+ bool changedId = false;
+ getIndexChanges(changes, *d, objNew, objOld, changedId);
+ uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
+ dupCheck(changes, *d, dl);
+
+ if ( toupdate->netLength() < objNew.objsize() ) {
+ // doesn't fit. reallocate -----------------------------------------------------
+ uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped));
+ d->paddingTooSmall();
+ debug.moved = true;
+ deleteRecord(ns, toupdate, dl);
+ return insert(ns, objNew.objdata(), objNew.objsize(), god);
+ }
+
+ nsdt->notifyOfWriteOp();
+ d->paddingFits();
+
+ /* have any index keys changed? */
+ {
+ int keyUpdates = 0;
+ int z = d->nIndexesBeingBuilt();
+ for ( int x = 0; x < z; x++ ) {
+ IndexDetails& idx = d->idx(x);
+ IndexInterface& ii = idx.idxInterface();
+ for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) {
+ try {
+ bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl);
+ if ( ! found ) {
+ RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i]
+ << " for doc: " << objOld["_id"] << endl;
+ }
+ }
+ catch (AssertionException&) {
+ debug.extra << " exception update unindex ";
+ problem() << " caught assertion update unindex " << idx.indexNamespace() << endl;
+ }
+ }
+ assert( !dl.isNull() );
+ BSONObj idxKey = idx.info.obj().getObjectField("key");
+ Ordering ordering = Ordering::make(idxKey);
+ keyUpdates += changes[x].added.size();
+ for ( unsigned i = 0; i < changes[x].added.size(); i++ ) {
+ try {
+ /* we did the dupCheck() above. so we don't have to worry about it here. */
+ ii.bt_insert(
+ idx.head,
+ dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
+ }
+ catch (AssertionException& e) {
+ debug.extra << " exception update index ";
+ problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << " " << objNew["_id"] << endl;
+ }
+ }
+ }
+
+ debug.keyUpdates = keyUpdates;
+ }
+
+ // update in place
+ int sz = objNew.objsize();
+ memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
+ return dl;
+ }
+
+ int Extent::followupSize(int len, int lastExtentLen) {
+ assert( len < Extent::maxSize() );
+ int x = initialSize(len);
+ // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster
+ int y = (int) (lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.35);
+ int sz = y > x ? y : x;
+
+ if ( sz < lastExtentLen ) {
+ // this means there was an int overflow
+ // so we should turn it into maxSize
+ sz = Extent::maxSize();
+ }
+ else if ( sz > Extent::maxSize() ) {
+ sz = Extent::maxSize();
+ }
+
+ sz = ((int)sz) & 0xffffff00;
+ assert( sz > len );
+
+ return sz;
+ }
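+
+    /* worked example (illustrative numbers): lastExtentLen = 1,000,000 (< 4MB)
+       is quadrupled to 4,000,000; a 10,000,000 byte last extent (>= 4MB) grows
+       by 1.35x to 13,500,000 instead. the 0xffffff00 mask then rounds down to a
+       256-byte multiple, and the result is clamped to Extent::maxSize(). */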
+
+    /* step one of adding keys to index idxNo for a new record.
+       the function returns nothing itself; the caller inspects the out-param keys
+       set -- more than one key means a multikey index and more work to do in step two.
+    */
+ static void _addKeysToIndexStepOneOfTwo(BSONObjSet & /*out*/keys, NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, IndexDetails& idx) {
+ idx.getKeysFromObject(obj, keys);
+ if( keys.empty() )
+ return;
+ bool dupsAllowed = !idx.unique();
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(order);
+
+ assert( !recordLoc.isNull() );
+
+ try {
+            // we can't do the two step method with multikeys, as insertion of one key changes the index's
+            // structure. however we can do the first key of the set, so we go ahead and do that FWIW
+ ii.phasedQueueItemToInsert(idxNo, idx.head, recordLoc, *keys.begin(), ordering, idx, dupsAllowed);
+ }
+ catch (AssertionException& e) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ }
+ else {
+ throw;
+ }
+ }
+ }
+
+ namespace dur {
+ extern unsigned notesThisLock;
+ }
+
+ void upgradeToWritable(bool shouldBeUnlocked) {
+ // todo upgrade!
+ DEV {
+ // verify we haven't written yet (usually)
+
+ // test binary does special things so this would assert there so don't check there
+ if( shouldBeUnlocked && !cmdLine.binaryName.empty() && cmdLine.binaryName != "test" ) {
+ static unsigned long long zeroes;
+ static unsigned long long tot;
+ tot++;
+ if( dur::notesThisLock == 0 )
+ zeroes++;
+ if( tot > 1000 ) {
+ static int n;
+ DEV if( n++ == 0 )
+ log() << "warning upgradeToWritable: already in writable too often" << endl;
+ }
+ }
+ }
+ }
+
+ /** add index keys for a newly inserted record
+ done in two steps/phases to defer write lock portion
+ */
+ static void indexRecordUsingTwoSteps(NamespaceDetails *d, BSONObj obj, DiskLoc loc, bool shouldBeUnlocked) {
+ vector<int> multi;
+ vector<BSONObjSet> multiKeys;
+
+ IndexInterface::phasedBegin();
+
+ int n = d->nIndexesBeingBuilt();
+ {
+ BSONObjSet keys;
+ for ( int i = 0; i < n; i++ ) {
+ IndexDetails& idx = d->idx(i);
+ // this call throws on unique constraint violation. we haven't done any writes yet so that is fine.
+ _addKeysToIndexStepOneOfTwo(/*out*/keys, d, i, obj, loc, idx);
+ if( keys.size() > 1 ) {
+ multi.push_back(i);
+ multiKeys.push_back(BSONObjSet());
+ multiKeys[multiKeys.size()-1].swap(keys);
+ }
+ keys.clear();
+ }
+ }
+
+ // update lock to writable here. TODO
+
+ upgradeToWritable(shouldBeUnlocked);
+
+ IndexInterface::phasedFinish(); // step 2
+
+ // now finish adding multikeys
+ for( unsigned j = 0; j < multi.size(); j++ ) {
+ unsigned i = multi[j];
+ BSONObjSet& keys = multiKeys[j];
+ IndexDetails& idx = d->idx(i);
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(idx.keyPattern());
+ d->setIndexIsMultikey(i);
+ for( BSONObjSet::iterator k = ++keys.begin()/*skip 1*/; k != keys.end(); k++ ) {
+ try {
+ ii.bt_insert(idx.head, loc, *k, ordering, !idx.unique(), idx);
+ } catch (AssertionException& e) {
+ if( e.getCode() == 10287 && (int) i == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ }
+ else {
+ /* roll back previously added index entries
+ note must do self index as it is multikey and could require some cleanup itself
+ */
+ for( int j = 0; j < n; j++ ) {
+ try {
+ _unindexRecord(d->idx(j), obj, loc, false);
+ }
+ catch(...) {
+ log(3) << "unindex fails on rollback after unique key constraint prevented insert\n";
+ }
+ }
+ throw;
+ }
+ }
+ }
+ }
+ }
+
+ /* add keys to index idxNo for a new record */
+ static void addKeysToIndex(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) {
+ IndexDetails& idx = d->idx(idxNo);
+ BSONObjSet keys;
+ idx.getKeysFromObject(obj, keys);
+ if( keys.empty() )
+ return;
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ Ordering ordering = Ordering::make(order);
+ int n = 0;
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ if( ++n == 2 ) {
+ d->setIndexIsMultikey(idxNo);
+ }
+ assert( !recordLoc.isNull() );
+ try {
+ ii.bt_insert(idx.head, recordLoc, *i, ordering, dupsAllowed, idx);
+ }
+ catch (AssertionException& e) {
+ if( e.getCode() == 10287 && idxNo == d->nIndexes ) {
+ DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
+ continue;
+ }
+ if( !dupsAllowed ) {
+ // dup key exception, presumably.
+ throw;
+ }
+ problem() << " caught assertion addKeysToIndex " << idx.indexNamespace() << " " << obj["_id"] << endl;
+ }
+ }
+ }
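+
+    /* note on the ++n == 2 test above: the second key emitted for one document
+       is what flags the index as multikey. e.g. (hypothetical docs) indexing
+       { a: [ 1, 2 ] } on { a: 1 } yields two keys and sets the flag, while
+       { a: 1 } yields one key and leaves it alone. */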
+
+#if 0
+ void testSorting() {
+ BSONObjBuilder b;
+ b.appendNull("");
+ BSONObj x = b.obj();
+
+ BSONObjExternalSorter sorter(*IndexDetails::iis[1]);
+
+ sorter.add(x, DiskLoc(3,7));
+ sorter.add(x, DiskLoc(4,7));
+ sorter.add(x, DiskLoc(2,7));
+ sorter.add(x, DiskLoc(1,7));
+ sorter.add(x, DiskLoc(3,77));
+
+ sorter.sort();
+
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ while( i->more() ) {
+ BSONObjExternalSorter::Data d = i->next();
+ /*cout << d.second.toString() << endl;
+ cout << d.first.objsize() << endl;
+ cout<<"SORTER next:" << d.first.toString() << endl;*/
+ }
+ }
+#endif
+
+ SortPhaseOne *precalced = 0;
+
+ template< class V >
+ void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter,
+ bool dropDups, list<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1, ProgressMeterHolder &pm,
+ Timer& t
+ )
+ {
+ BtreeBuilder<V> btBuilder(dupsAllowed, idx);
+ BSONObj keyLast;
+ auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+ assert( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) );
+ while( i->more() ) {
+ RARELY killCurrentOp.checkForInterrupt();
+ BSONObjExternalSorter::Data d = i->next();
+
+ try {
+ if ( !dupsAllowed && dropDups ) {
+ LastError::Disabled led( lastError.get() );
+ btBuilder.addKey(d.first, d.second);
+ }
+ else {
+ btBuilder.addKey(d.first, d.second);
+ }
+ }
+ catch( AssertionException& e ) {
+ if ( dupsAllowed ) {
+                    // unknown exception??
+ throw;
+ }
+
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
+
+ if ( ! dropDups )
+ throw;
+
+ /* we could queue these on disk, but normally there are very few dups, so instead we
+ keep in ram and have a limit.
+ */
+ dupsToDrop.push_back(d.second);
+                uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
+ }
+ pm.hit();
+ }
+ pm.finished();
+ op->setMessage( "index: (3/3) btree-middle" );
+ log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
+ btBuilder.commit();
+ if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) {
+ warning() << "not all entries were added to the index, probably some keys were too large" << endl;
+ }
+ }
+
+ // throws DBException
+ unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ CurOp * op = cc().curop();
+
+ Timer t;
+
+ tlog(1) << "fastBuildIndex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl;
+
+ bool dupsAllowed = !idx.unique();
+ bool dropDups = idx.dropDups() || inDBRepair;
+ BSONObj order = idx.keyPattern();
+
+ getDur().writingDiskLoc(idx.head).Null();
+
+ if ( logLevel > 1 ) printMemInfo( "before index start" );
+
+ /* get and sort all the keys ----- */
+ ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
+ SortPhaseOne _ours;
+ SortPhaseOne *phase1 = precalced;
+ if( phase1 == 0 ) {
+ phase1 = &_ours;
+ SortPhaseOne& p1 = *phase1;
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) );
+ p1.sorter->hintNumObjects( d->stats.nrecords );
+ const IndexSpec& spec = idx.getSpec();
+ while ( c->ok() ) {
+ BSONObj o = c->current();
+ DiskLoc loc = c->currLoc();
+ p1.addKeys(spec, o, loc);
+ c->advance();
+ pm.hit();
+ if ( logLevel > 1 && p1.n % 10000 == 0 ) {
+ printMemInfo( "\t iterating objects" );
+ }
+            }
+ }
+ pm.finished();
+
+ BSONObjExternalSorter& sorter = *(phase1->sorter);
+
+ if( phase1->multi )
+ d->setIndexIsMultikey(idxNo);
+
+ if ( logLevel > 1 ) printMemInfo( "before final sort" );
+ phase1->sorter->sort();
+ if ( logLevel > 1 ) printMemInfo( "after final sort" );
+
+        log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files in " << t.seconds() << " secs" << endl;
+
+ list<DiskLoc> dupsToDrop;
+
+ /* build index --- */
+ if( idx.version() == 0 )
+ buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else if( idx.version() == 1 )
+ buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
+ else
+ assert(false);
+
+ log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
+
+ for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){
+ theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ );
+ getDur().commitIfNeeded();
+ }
+
+ return phase1->n;
+ }
+
+ class BackgroundIndexBuildJob : public BackgroundOperation {
+
+ unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ bool dupsAllowed = !idx.unique();
+ bool dropDups = idx.dropDups();
+
+ ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );
+
+ unsigned long long n = 0;
+ auto_ptr<ClientCursor> cc;
+ {
+ shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
+ cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
+ }
+ CursorId id = cc->cursorid();
+
+ while ( cc->ok() ) {
+ BSONObj js = cc->current();
+ try {
+ {
+ if ( !dupsAllowed && dropDups ) {
+ LastError::Disabled led( lastError.get() );
+ addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ else {
+ addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed);
+ }
+ }
+ cc->advance();
+ }
+ catch( AssertionException& e ) {
+ if( e.interrupted() ) {
+ killCurrentOp.checkForInterrupt();
+ }
+
+ if ( dropDups ) {
+ DiskLoc toDelete = cc->currLoc();
+ bool ok = cc->advance();
+ cc->updateLocation();
+ theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true );
+ if( ClientCursor::find(id, false) == 0 ) {
+ cc.release();
+ if( !ok ) {
+ /* we were already at the end. normal. */
+ }
+ else {
+ uasserted(12585, "cursor gone during bg index; dropDups");
+ }
+ break;
+ }
+ }
+ else {
+ log() << "background addExistingToIndex exception " << e.what() << endl;
+ throw;
+ }
+ }
+ n++;
+ progress.hit();
+
+ getDur().commitIfNeeded();
+
+ if ( cc->yieldSometimes( ClientCursor::WillNeed ) ) {
+ progress.setTotalWhileRunning( d->stats.nrecords );
+ }
+ else {
+ cc.release();
+ uasserted(12584, "cursor gone during bg index");
+ break;
+ }
+ }
+ progress.finished();
+ return n;
+ }
+
+ /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
+ that way on a crash/restart, we don't think we are still building one. */
+ set<NamespaceDetails*> bgJobsInProgress;
+
+ void prep(const char *ns, NamespaceDetails *d) {
+ assertInWriteLock();
+ uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , mongo::d.dbMutex.getState() == 1 );
+ bgJobsInProgress.insert(d);
+ }
+ void done(const char *ns, NamespaceDetails *d) {
+ NamespaceDetailsTransient::get(ns).addedIndex(); // clear query optimizer cache
+ assertInWriteLock();
+ }
+
+ public:
+ BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }
+
+ unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
+ unsigned long long n = 0;
+
+ prep(ns.c_str(), d);
+ assert( idxNo == d->nIndexes );
+ try {
+ idx.head.writing() = idx.idxInterface().addBucket(idx);
+ n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
+ }
+ catch(...) {
+ if( cc().database() && nsdetails(ns.c_str()) == d ) {
+ assert( idxNo == d->nIndexes );
+ done(ns.c_str(), d);
+ }
+ else {
+ log() << "ERROR: db gone during bg index?" << endl;
+ }
+ throw;
+ }
+ assert( idxNo == d->nIndexes );
+ done(ns.c_str(), d);
+ return n;
+ }
+ };
+
+ /**
+ * For the lifetime of this object, an index build is indicated on the specified
+ * namespace and the newest index is marked as absent. This simplifies
+ * the cleanup required on recovery.
+ */
+ class RecoverableIndexState {
+ public:
+ RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
+ indexBuildInProgress() = 1;
+ nIndexes()--;
+ }
+ ~RecoverableIndexState() {
+ DESTRUCTOR_GUARD (
+ nIndexes()++;
+ indexBuildInProgress() = 0;
+ )
+ }
+ private:
+ int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
+ int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
+ NamespaceDetails *_d;
+ };
+
+ // throws DBException
+ static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
+ tlog() << "build index " << ns << ' ' << idx.keyPattern() << ( background ? " background" : "" ) << endl;
+ Timer t;
+ unsigned long long n;
+
+ assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be...
+ assert( d->indexBuildInProgress == 0 );
+ assertInWriteLock();
+ RecoverableIndexState recoverable( d );
+
+ // Build index spec here in case the collection is empty and the index details are invalid
+ idx.getSpec();
+
+ if( inDBRepair || !background ) {
+ n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
+ assert( !idx.head.isNull() );
+ }
+ else {
+ BackgroundIndexBuildJob j(ns.c_str());
+ n = j.go(ns, d, idx, idxNo);
+ }
+ tlog() << "build index done " << n << " records " << t.millis() / 1000.0 << " secs" << endl;
+ }
+
+ /* add keys to indexes for a new record */
+#if 0
+ static void oldIndexRecord__notused(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
+ int n = d->nIndexesBeingBuilt();
+ for ( int i = 0; i < n; i++ ) {
+ try {
+ bool unique = d->idx(i).unique();
+ addKeysToIndex(d, i, obj, loc, /*dupsAllowed*/!unique);
+ }
+ catch( DBException& ) {
+ /* try to roll back previously added index entries
+                note <= i (not < i) is important here as the index we just attempted
+                may be multikey and require some cleanup itself.
+ */
+ for( int j = 0; j <= i; j++ ) {
+ try {
+ _unindexRecord(d->idx(j), obj, loc, false);
+ }
+ catch(...) {
+ log(3) << "unindex fails on rollback after unique failure\n";
+ }
+ }
+ throw;
+ }
+ }
+ }
+#endif
+
+ extern BSONObj id_obj; // { _id : 1 }
+
+ void ensureHaveIdIndex(const char *ns) {
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
+ return;
+
+ *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;
+
+ {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ if( i.next().isIdIndex() )
+ return;
+ }
+ }
+
+ string system_indexes = cc().database()->name + ".system.indexes";
+
+ BSONObjBuilder b;
+ b.append("name", "_id_");
+ b.append("ns", ns);
+ b.append("key", id_obj);
+ BSONObj o = b.done();
+
+ /* edge case: note the insert could fail if we have hit maxindexes already */
+ theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true);
+ }
+
+#pragma pack(1)
+ struct IDToInsert_ {
+ char type;
+ char _id[4];
+ OID oid;
+ IDToInsert_() {
+ type = (char) jstOID;
+ strcpy(_id, "_id");
+ assert( sizeof(IDToInsert_) == 17 );
+ }
+ } idToInsert_;
+ struct IDToInsert : public BSONElement {
+ IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
+ } idToInsert;
+#pragma pack()
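+
+    /* IDToInsert_ is a hand-built BSON element: 1 type byte (jstOID), 4 bytes
+       for the field name "_id\0", and a 12 byte OID -- the 17 bytes asserted in
+       the constructor. wrapping it in a BSONElement lets insert() below splice a
+       fresh _id into a record without building a whole new BSONObj. */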
+
+ void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
+ BSONObj tmp = o;
+ insertWithObjMod( ns, tmp, god );
+ logOp( "i", ns, tmp );
+ }
+
+ /** @param o the object to insert. can be modified to add _id and thus be an in/out param
+ */
+ DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
+ bool addedID = false;
+ DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID );
+ if( addedID && !loc.isNull() )
+ o = BSONObj( loc.rec() );
+ return loc;
+ }
+
+ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );
+
+    // We are now doing two btree scans for all unique indexes (one here, and one when we've
+    // written the record to the collection). This could be made more efficient by inserting
+    // dummy data here, keeping pointers to the btree nodes holding the dummy data and then
+    // updating the dummy data with the DiskLoc of the real record.
+ void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
+ for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
+ if( d->idx(idxNo).unique() ) {
+ IndexDetails& idx = d->idx(idxNo);
+ BSONObjSet keys;
+ idx.getKeysFromObject(obj, keys);
+ BSONObj order = idx.keyPattern();
+ IndexInterface& ii = idx.idxInterface();
+ for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+ // WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+ // findSingle code.
+ uassert( 12582, "duplicate key insert for unique index of capped collection",
+ ii.findSingle(idx, idx.head, *i ).isNull() );
+ }
+ }
+ }
+ }
+
+ /** add a record to the end of the linked list chain within this extent.
+ require: you must have already declared write intent for the record header.
+ */
+ void addRecordToRecListInExtent(Record *r, DiskLoc loc) {
+ dassert( loc.rec() == r );
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing(e->fl());
+ fl->firstRecord = fl->lastRecord = loc;
+ r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ r->prevOfs = e->lastRecord.getOfs();
+ r->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+ getDur().writingDiskLoc(e->lastRecord) = loc;
+ }
+ }
+
+ NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) {
+ DiskLoc loc;
+ if ( d->capped == 0 ) { // size capped doesn't grow
+ log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
+ for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) {
+ log() << "try #" << z << endl;
+ cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+ loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( ! loc.isNull() )
+ break;
+ }
+ }
+ }
+ return loc;
+ }
+
+ /** used by insert and also compact
+ * @return null loc if out of space
+ */
+ DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god) {
+ DiskLoc extentLoc;
+ DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ if ( loc.isNull() ) {
+ loc = outOfSpace(ns, d, lenWHdr, god, extentLoc);
+ }
+ return loc;
+ }
+
+ bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) {
+ uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
+ if ( strstr(ns, ".system.") ) {
+ // later:check for dba-type permissions here if have that at some point separate
+ if ( strstr(ns, ".system.indexes" ) )
+ wouldAddIndex = true;
+ else if ( legalClientSystemNS( ns , true ) ) {
+ if ( obuf && strstr( ns , ".system.users" ) ) {
+ BSONObj t( reinterpret_cast<const char *>( obuf ) );
+ uassert( 14051 , "system.user entry needs 'user' field to be a string" , t["user"].type() == String );
+ uassert( 14052 , "system.user entry needs 'pwd' field to be a string" , t["pwd"].type() == String );
+ uassert( 14053 , "system.user entry needs 'user' field to be non-empty" , t["user"].String().size() );
+ uassert( 14054 , "system.user entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() );
+ }
+ }
+ else if ( !god ) {
+                // todo this should probably uassert rather than doing this:
+ log() << "ERROR: attempt to insert in system namespace " << ns << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ NOINLINE_DECL NamespaceDetails* insert_newNamespace(const char *ns, int len, bool god) {
+ addNewNamespaceToCatalog(ns);
+ /* todo: shouldn't be in the namespace catalog until after the allocations here work.
+ also if this is an addIndex, those checks should happen before this!
+ */
+ // This may create first file in the database.
+ int ies = Extent::initialSize(len);
+ if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
+ // probably an index. so we pick a value here for the first extent instead of using initialExtentSize() which is more
+ // for user collections. TODO: we could look at the # of records in the parent collection to be smarter here.
+ ies = (32+4) * 1024;
+ }
+ cc().database()->allocExtent(ns, ies, false, false);
+ NamespaceDetails *d = nsdetails(ns);
+ if ( !god )
+ ensureIdIndexForNewNs(ns);
+ return d;
+ }
+
+ void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) {
+ uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );
+
+ BSONObj info = loc.obj();
+ bool background = info["background"].trueValue();
+ // if this is not readable, let's move things along
+ if (background && ((!theReplSet && cc().isSyncThread()) || (theReplSet && !theReplSet->isSecondary()))) {
+ log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
+ background = false;
+ }
+
+ int idxNo = tableToIndex->nIndexes;
+ IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
+ getDur().writingDiskLoc(idx.info) = loc;
+ try {
+ buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
+ }
+ catch( DBException& e ) {
+ // save our error msg string as an exception or dropIndexes will overwrite our message
+ LastError *le = lastError.get();
+ int savecode = 0;
+ string saveerrmsg;
+ if ( le ) {
+ savecode = le->code;
+ saveerrmsg = le->msg;
+ }
+ else {
+ savecode = e.getCode();
+ saveerrmsg = e.what();
+ }
+
+ // roll back this index
+ string name = idx.indexName();
+ BSONObjBuilder b;
+ string errmsg;
+ bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
+ if( !ok ) {
+ log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
+ }
+
+ assert( le && !saveerrmsg.empty() );
+ raiseError(savecode,saveerrmsg.c_str());
+ throw;
+ }
+ }
+
+ /* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
+ after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
+
+ @param mayAddIndex almost always true, except for invocation from rename namespace command.
+ @param addedID if not null, set to true if adding _id element. you must assure false before calling
+ if using.
+ */
+
+ DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) {
+ bool wouldAddIndex = false;
+ massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
+ uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
+ {
+ const char *sys = strstr(ns, "system.");
+ if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
+ return DiskLoc();
+ }
+ bool addIndex = wouldAddIndex && mayAddIndex;
+
+ NamespaceDetails *d = nsdetails(ns);
+ if ( d == 0 ) {
+ d = insert_newNamespace(ns, len, god);
+ }
+
+ NamespaceDetails *tableToIndex = 0;
+
+ string tabletoidxns;
+ BSONObj fixedIndexObject;
+ if ( addIndex ) {
+ assert( obuf );
+ BSONObj io((const char *) obuf);
+ if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) {
+ // prepare creates _id itself, or this indicates to fail the build silently (such
+ // as if index already exists)
+ return DiskLoc();
+ }
+ if ( ! fixedIndexObject.isEmpty() ) {
+ obuf = fixedIndexObject.objdata();
+ len = fixedIndexObject.objsize();
+ }
+ }
+
+ int addID = 0; // 0 if not adding _id; if adding, the length of that new element
+ if( !god ) {
+ /* Check if we have an _id field. If we don't, we'll add it.
+ Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
+ */
+ BSONObj io((const char *) obuf);
+ BSONElement idField = io.getField( "_id" );
+ uassert( 10099 , "_id cannot be an array", idField.type() != Array );
+ // we don't add _id for capped collections as they don't have an _id index
+ if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 && d->haveIdIndex() ) {
+ if( addedID )
+ *addedID = true;
+ addID = len;
+ idToInsert_.oid.init();
+ len += idToInsert.size();
+ }
+
+ BSONElementManipulator::lookForTimestamps( io );
+ }
+
+ int lenWHdr = len + Record::HeaderSize;
+ lenWHdr = (int) (lenWHdr * d->paddingFactor);
+ if ( lenWHdr == 0 ) {
+ // old datafiles, backward compatible here.
+ assert( d->paddingFactor == 0 );
+ *getDur().writing(&d->paddingFactor) = 1.0;
+ lenWHdr = len + Record::HeaderSize;
+ }
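+        /* e.g. (illustrative numbers) a 1000 byte object with paddingFactor 1.5
+           reserves (1000 + Record::HeaderSize) * 1.5 bytes so the record can grow
+           in place; paddingFits()/paddingTooSmall() in updateRecord above nudge
+           the factor over time. */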
+
+ // If the collection is capped, check if the new object will violate a unique index
+ // constraint before allocating space.
+ if ( d->nIndexes && d->capped && !god ) {
+ checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
+ }
+
+ bool earlyIndex = true;
+ DiskLoc loc;
+ if( addID || tableToIndex || d->capped ) {
+            // if we need an _id, we don't do the early indexing. this is not the common case, so that is sort of ok
+ earlyIndex = false;
+ loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ }
+ else {
+ loc = d->allocWillBeAt(ns, lenWHdr);
+ if( loc.isNull() ) {
+ // need to get a new extent so we have to do the true alloc now (not common case)
+ earlyIndex = false;
+ loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ }
+ }
+ if ( loc.isNull() ) {
+ log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl;
+ assert(d->capped);
+ return DiskLoc();
+ }
+
+ if( earlyIndex ) {
+ // add record to indexes using two step method so we can do the reading outside a write lock
+ if ( d->nIndexes ) {
+ assert( obuf );
+ BSONObj obj((const char *) obuf);
+ try {
+ indexRecordUsingTwoSteps(d, obj, loc, true);
+ }
+ catch( AssertionException& ) {
+ // should be a dup key error on _id index
+ dassert( !tableToIndex && !d->capped );
+ // no need to delete/rollback the record as it was not added yet
+ throw;
+ }
+ }
+ // really allocate now
+ DiskLoc real = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
+ assert( real == loc );
+ }
+
+ Record *r = loc.rec();
+ {
+ assert( r->lengthWithHeaders >= lenWHdr );
+ r = (Record*) getDur().writingPtr(r, lenWHdr);
+ if( addID ) {
+ /* a little effort was made here to avoid a double copy when we add an ID */
+ ((int&)*r->data) = *((int*) obuf) + idToInsert.size();
+ memcpy(r->data+4, idToInsert.rawdata(), idToInsert.size());
+ memcpy(r->data+4+idToInsert.size(), ((char *)obuf)+4, addID-4);
+ }
+ else {
+ if( obuf ) // obuf can be null from internal callers
+ memcpy(r->data, obuf, len);
+ }
+ }
+
+ addRecordToRecListInExtent(r, loc);
+
+ /* durability todo : this could be a bit annoying / slow to record constantly */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
+
+ // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
+ if ( !god )
+ NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
+
+ if ( tableToIndex ) {
+ insert_makeIndex(tableToIndex, tabletoidxns, loc);
+ }
+
+ /* add this record to our indexes */
+ if ( !earlyIndex && d->nIndexes ) {
+ try {
+ BSONObj obj(r->data);
+ // not sure which of these is better -- either can be used. oldIndexRecord may be faster,
+ // but twosteps handles dup key errors more efficiently.
+ //oldIndexRecord(d, obj, loc);
+ indexRecordUsingTwoSteps(d, obj, loc, false);
+
+ }
+ catch( AssertionException& e ) {
+ // should be a dup key error on _id index
+ if( tableToIndex || d->capped ) {
+ massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
+ string s = e.toString();
+ s += " : on addIndex/capped - collection and its index will not match";
+ uassert_nothrow(s.c_str());
+ error() << s << endl;
+ }
+ else {
+ // normal case -- we can roll back
+ _deleteRecord(d, ns, r, loc);
+ throw;
+ }
+ }
+ }
+
+ d->paddingFits();
+
+ return loc;
+ }
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+ assumes ns is capped and no indexes
+ */
+ Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
+ assert( d );
+ RARELY assert( d == nsdetails(ns) );
+ DEV assert( d == nsdetails(ns) );
+
+ DiskLoc extentLoc;
+ int lenWHdr = len + Record::HeaderSize;
+ DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+ assert( !loc.isNull() );
+
+ Record *r = loc.rec();
+ assert( r->lengthWithHeaders >= lenWHdr );
+
+ Extent *e = r->myExtent(loc);
+ if ( e->lastRecord.isNull() ) {
+ Extent::FL *fl = getDur().writing( e->fl() );
+ fl->firstRecord = fl->lastRecord = loc;
+
+ Record::NP *np = getDur().writing(r->np());
+ np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = e->lastRecord.rec();
+ Record::NP *np = getDur().writing(r->np());
+ np->prevOfs = e->lastRecord.getOfs();
+ np->nextOfs = DiskLoc::NullOfs;
+ getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
+ e->lastRecord.writing() = loc;
+ }
+
+ /* todo: don't update for oplog? seems wasteful. */
+ {
+ NamespaceDetails::Stats *s = getDur().writing(&d->stats);
+ s->datasize += r->netLength();
+ s->nrecords++;
+ }
+
+ return r;
+ }
+
+} // namespace mongo
+
+#include "clientcursor.h"
+
+namespace mongo {
+
+ void dropAllDatabasesExceptLocal() {
+ writelock lk("");
+
+ vector<string> n;
+ getDatabaseNames(n);
+ if( n.size() == 0 ) return;
+ log() << "dropAllDatabasesExceptLocal " << n.size() << endl;
+ for( vector<string>::iterator i = n.begin(); i != n.end(); i++ ) {
+ if( *i != "local" ) {
+ Client::Context ctx(*i);
+ dropDatabase(*i);
+ }
+ }
+ }
+
+ void dropDatabase(string db) {
+ log(1) << "dropDatabase " << db << endl;
+ Database *d = cc().database();
+ assert( d );
+ assert( d->name == db );
+
+ BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
+
+ mongo::d.dbMutex.assertWriteLocked();
+
+ // Not sure we need this here, so removed. If we do, we need to move it down
+ // within other calls both (1) as they could be called from elsewhere and
+ // (2) to keep the lock order right - groupcommitmutex must be locked before
+ // mmmutex (if both are locked).
+ //
+ // RWLockRecursive::Exclusive lk(MongoFile::mmmutex);
+
+ getDur().syncDataAndTruncateJournal();
+
+ Database::closeDatabase( d->name.c_str(), d->path );
+ d = 0; // d is now deleted
+
+ _deleteDataFiles( db.c_str() );
+ }
+
+ typedef boost::filesystem::path Path;
+
+ void boostRenameWrapper( const Path &from, const Path &to ) {
+ try {
+ boost::filesystem::rename( from, to );
+ }
+ catch ( const boost::filesystem::filesystem_error & ) {
+ // boost rename doesn't work across partitions
+ boost::filesystem::copy_file( from, to);
+ boost::filesystem::remove( from );
+ }
+ }
+
+ // back up original database files to 'temp' dir
+ void _renameForBackup( const char *database, const Path &reservedPath ) {
+ Path newPath( reservedPath );
+ if ( directoryperdb )
+ newPath /= database;
+ class Renamer : public FileOp {
+ public:
+ Renamer( const Path &newPath ) : newPath_( newPath ) {}
+ private:
+ const boost::filesystem::path &newPath_;
+ virtual bool apply( const Path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ boostRenameWrapper( p, newPath_ / ( p.leaf() + ".bak" ) );
+ return true;
+ }
+ virtual const char * op() const {
+ return "renaming";
+ }
+ } renamer( newPath );
+ _applyOpToDataFiles( database, renamer, true );
+ }
+
+ // move temp files to standard data dir
+ void _replaceWithRecovered( const char *database, const char *reservedPathString ) {
+ Path newPath( dbpath );
+ if ( directoryperdb )
+ newPath /= database;
+ class Replacer : public FileOp {
+ public:
+ Replacer( const Path &newPath ) : newPath_( newPath ) {}
+ private:
+ const boost::filesystem::path &newPath_;
+ virtual bool apply( const Path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ boostRenameWrapper( p, newPath_ / p.leaf() );
+ return true;
+ }
+ virtual const char * op() const {
+ return "renaming";
+ }
+ } replacer( newPath );
+ _applyOpToDataFiles( database, replacer, true, reservedPathString );
+ }
+
+ // generate a directory name for storing temp data files
+ Path uniqueReservedPath( const char *prefix ) {
+ Path repairPath = Path( repairpath );
+ Path reservedPath;
+ int i = 0;
+ bool exists = false;
+ do {
+ stringstream ss;
+ ss << prefix << "_repairDatabase_" << i++;
+ reservedPath = repairPath / ss.str();
+ BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) );
+ }
+ while ( exists );
+ return reservedPath;
+ }
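+
+    /* e.g. (hypothetical paths) with repairpath /data/db and prefix "backup"
+       this probes /data/db/backup_repairDatabase_0, then _1, ... until a
+       directory name that doesn't exist yet is found. */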
+
+ boost::intmax_t dbSize( const char *database ) {
+ class SizeAccumulator : public FileOp {
+ public:
+ SizeAccumulator() : totalSize_( 0 ) {}
+ boost::intmax_t size() const {
+ return totalSize_;
+ }
+ private:
+ virtual bool apply( const boost::filesystem::path &p ) {
+ if ( !boost::filesystem::exists( p ) )
+ return false;
+ totalSize_ += boost::filesystem::file_size( p );
+ return true;
+ }
+ virtual const char *op() const {
+ return "checking size";
+ }
+ boost::intmax_t totalSize_;
+ };
+ SizeAccumulator sa;
+ _applyOpToDataFiles( database, sa );
+ return sa.size();
+ }
+
+ bool repairDatabase( string dbNameS , string &errmsg,
+ bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) {
+ doingRepair dr;
+ dbNameS = nsToDatabase( dbNameS );
+ const char * dbName = dbNameS.c_str();
+
+ stringstream ss;
+ ss << "localhost:" << cmdLine.port;
+ string localhost = ss.str();
+
+ problem() << "repairDatabase " << dbName << endl;
+ assert( cc().database()->name == dbName );
+ assert( cc().database()->path == dbpath );
+
+ BackgroundOperation::assertNoBgOpInProgForDb(dbName);
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ boost::intmax_t totalSize = dbSize( dbName );
+ boost::intmax_t freeSize = File::freeSpace(repairpath);
+ if ( freeSize > -1 && freeSize < totalSize ) {
+ stringstream ss;
+ ss << "Cannot repair database " << dbName << " having size: " << totalSize
+ << " (bytes) because free disk space is: " << freeSize << " (bytes)";
+ errmsg = ss.str();
+ problem() << errmsg << endl;
+ return false;
+ }
+
+ Path reservedPath =
+ uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ?
+ "backup" : "_tmp" );
+ BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
+ string reservedPathString = reservedPath.native_directory_string();
+
+ bool res;
+ {
+ // clone to temp location, which effectively does repair
+ Client::Context ctx( dbName, reservedPathString );
+ assert( ctx.justCreated() );
+
+ res = cloneFrom(localhost.c_str(), errmsg, dbName,
+ /*logForReplication=*/false, /*slaveOk*/false, /*replauth*/false,
+ /*snapshot*/false, /*mayYield*/false, /*mayBeInterrupted*/true);
+ Database::closeDatabase( dbName, reservedPathString.c_str() );
+ }
+
+ if ( !res ) {
+ errmsg = str::stream() << "clone failed for " << dbName << " with error: " << errmsg;
+ problem() << errmsg << endl;
+
+ if ( !preserveClonedFilesOnFailure )
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ return false;
+ }
+
+ MongoFile::flushAll(true);
+
+ Client::Context ctx( dbName );
+ Database::closeDatabase( dbName, dbpath );
+
+ if ( backupOriginalFiles ) {
+ _renameForBackup( dbName, reservedPath );
+ }
+ else {
+ _deleteDataFiles( dbName );
+ BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
+ }
+
+ _replaceWithRecovered( dbName, reservedPathString.c_str() );
+
+ if ( !backupOriginalFiles )
+ BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );
+
+ getDur().syncDataAndTruncateJournal(); // Must be done before and after repair
+
+ return true;
+ }
+
+ void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
+ if ( afterAllocator )
+ FileAllocator::get()->waitUntilFinished();
+ string c = database;
+ c += '.';
+ boost::filesystem::path p(path);
+ if ( directoryperdb )
+ p /= database;
+ boost::filesystem::path q;
+ q = p / (c+"ns");
+ bool ok = false;
+ BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) );
+ if ( ok )
+ log(2) << fo.op() << " file " << q.string() << endl;
+ int i = 0;
+ int extra = 10; // should not be necessary, this is defensive in case there are missing files
+ while ( 1 ) {
+ assert( i <= DiskLoc::MaxFiles );
+ stringstream ss;
+ ss << c << i;
+ q = p / ss.str();
+ BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
+ if ( ok ) {
+ if ( extra != 10 ) {
+ log(1) << fo.op() << " file " << q.string() << endl;
+ log() << " _applyOpToDataFiles() warning: extra == " << extra << endl;
+ }
+ }
+ else if ( --extra <= 0 )
+ break;
+ i++;
+ }
+ }
+
+ NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); }
+
+ bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
+ log() << "DatabaseHolder::closeAll path:" << path << endl;
+ d.dbMutex.assertWriteLocked();
+
+ map<string,Database*>& m = _paths[path];
+ _size -= m.size();
+
+ set< string > dbs;
+ for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
+ wassert( i->second->path == path );
+ dbs.insert( i->first );
+ }
+
+ currentClient.get()->getContext()->_clear();
+
+ BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
+ int n = 0;
+ int nNotClosed = 0;
+ for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
+ string name = *i;
+ log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl;
+ Client::Context ctx( name , path );
+ if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) {
+ log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl;
+ nNotClosed++;
+ }
+ else {
+ Database::closeDatabase( name.c_str() , path );
+ bb.append( bb.numStr( n++ ) , name );
+ }
+ }
+ bb.done();
+ if( nNotClosed )
+ result.append("nNotClosed", nNotClosed);
+ else {
+ ClientCursor::assertNoCursors();
+ }
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/pdfile.h b/src/mongo/db/pdfile.h
new file mode 100644
index 00000000000..cd6062b1a48
--- /dev/null
+++ b/src/mongo/db/pdfile.h
@@ -0,0 +1,546 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* pdfile.h
+
+   Files:
+     database.ns - namespace index
+     database.0 - data files
+     database.1
+     ...
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/mmap.h"
+#include "diskloc.h"
+#include "jsobjmanipulator.h"
+#include "namespace-inl.h"
+#include "client.h"
+#include "mongommf.h"
+
+namespace mongo {
+
+ class DataFileHeader;
+ class Extent;
+ class Record;
+ class Cursor;
+ class OpDebug;
+
+ void dropDatabase(string db);
+ bool repairDatabase(string db, string &errmsg, bool preserveClonedFilesOnFailure = false, bool backupOriginalFiles = false);
+
+ /* low level - only drops this ns */
+ void dropNS(const string& dropNs);
+
+ /* deletes this ns, indexes and cursors */
+ void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result );
+ bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0);
+ shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc());
+
+ bool isValidNS( const StringData& ns );
+
+ /*---------------------------------------------------------------------*/
+
+ class MongoDataFile {
+ friend class DataFileMgr;
+ friend class BasicCursor;
+ public:
+ MongoDataFile(int fn) : _mb(0), fileNo(fn) { }
+
+        /** @return true if found and opened. If the file is uninitialized (preallocated only), it is not opened. */
+ bool openExisting( const char *filename );
+
+        /** creates the file if it does not already exist */
+ void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false);
+
+ /* allocate a new extent from this datafile.
+ @param capped - true if capped collection
+ @param loops is our recursion check variable - you want to pass in zero
+ */
+ Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);
+
+ DataFileHeader *getHeader() { return header(); }
+
+ unsigned long long length() const { return mmf.length(); }
+
+ /* return max size an extent may be */
+ static int maxSize();
+
+ /** fsync */
+ void flush( bool sync );
+
+        /** only use for debugging */
+ Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); }
+ private:
+ void badOfs(int) const;
+ void badOfs2(int) const;
+ int defaultSize( const char *filename ) const;
+
+ Extent* getExtent(DiskLoc loc) const;
+ Extent* _getExtent(DiskLoc loc) const;
+ Record* recordAt(DiskLoc dl);
+ Record* makeRecord(DiskLoc dl, int size);
+ void grow(DiskLoc dl, int size);
+
+ char* p() const { return (char *) _mb; }
+ DataFileHeader* header() { return (DataFileHeader*) _mb; }
+
+ MongoMMF mmf;
+ void *_mb; // the memory mapped view
+ int fileNo;
+ };
+
+ class DataFileMgr {
+ friend class BasicCursor;
+ public:
+ void init(const string& path );
+
+ /* see if we can find an extent of the right size in the freelist. */
+ static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false);
+
+ /** @return DiskLoc where item ends up */
+ // changedId should be initialized to false
+ const DiskLoc updateRecord(
+ const char *ns,
+ NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ Record *toupdate, const DiskLoc& dl,
+ const char *buf, int len, OpDebug& debug, bool god=false);
+
+ // The object o may be updated if modified on insert.
+ void insertAndLog( const char *ns, const BSONObj &o, bool god = false );
+
+ /** insert will add an _id to the object if not present. if you would like to see the final object
+ after such an addition, use this method.
+            @param o both an in and out param
+ */
+ DiskLoc insertWithObjMod(const char *ns, BSONObj & /*out*/o, bool god = false);
+
+ /** @param obj in value only for this version. */
+ void insertNoReturnVal(const char *ns, BSONObj o, bool god = false);
+
+ DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, bool mayAddIndex = true, bool *addedID = 0);
+ static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+           assumes ns is capped and has no indexes
+ no _id field check
+ */
+ Record* fast_oplog_insert(NamespaceDetails *d, const char *ns, int len);
+
+ static Extent* getExtent(const DiskLoc& dl);
+ static Record* getRecord(const DiskLoc& dl);
+ static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len);
+
+ void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false);
+
+ /* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */
+ void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
+
+ private:
+ vector<MongoDataFile *> files;
+ };
+
+ extern DataFileMgr theDataFileMgr;
+
+#pragma pack(1)
+
+ class DeletedRecord {
+ public:
+ int lengthWithHeaders;
+ int extentOfs;
+ DiskLoc nextDeleted;
+ DiskLoc myExtentLoc(const DiskLoc& myLoc) const {
+ return DiskLoc(myLoc.a(), extentOfs);
+ }
+ Extent* myExtent(const DiskLoc& myLoc) {
+ return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
+ }
+ };
+
+ /* Record is a record in a datafile. DeletedRecord is similar but for deleted space.
+
+    (11:03:20 AM) dm10gen: regarding extentOfs...
+    (11:03:42 AM) dm10gen: an extent is a contiguous disk area, which contains many Records and DeletedRecords
+    (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total)
+    (11:04:16 AM) dm10gen: to keep the headers small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
+ (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
+ (11:04:33 AM) dm10gen: see class DiskLoc for more info
+ (11:04:43 AM) dm10gen: so that is how Record::myExtent() works
+ (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then
+ */
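+    /* illustrative sketch (not part of the original interface): for a Record
+       living at DiskLoc(fileNo, recOfs), its extent can be recovered from the
+       4-byte extentOfs field alone, because the fileNo is shared:
+
+           DiskLoc extentLoc( myLoc.a(), extentOfs );   // same file, stored ofs
+    */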
+ class Record {
+ public:
+ enum HeaderSizeValue { HeaderSize = 16 };
+ int lengthWithHeaders;
+ int extentOfs;
+ int nextOfs;
+ int prevOfs;
+
+        /** be careful when referencing this; make sure your write intent was declared first */
+ char data[4];
+
+ int netLength() {
+ return lengthWithHeaders - HeaderSize;
+ }
+ //void setNewLength(int netlen) { lengthWithHeaders = netlen + HeaderSize; }
+
+ /* use this when a record is deleted. basically a union with next/prev fields */
+ DeletedRecord& asDeleted() { return *((DeletedRecord*) this); }
+
+ Extent* myExtent(const DiskLoc& myLoc) { return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs)); }
+
+ /* get the next record in the namespace, traversing extents as necessary */
+ DiskLoc getNext(const DiskLoc& myLoc);
+ DiskLoc getPrev(const DiskLoc& myLoc);
+
+ DiskLoc nextInExtent(const DiskLoc& myLoc) {
+ if ( nextOfs == DiskLoc::NullOfs )
+ return DiskLoc();
+ assert( nextOfs );
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+
+ struct NP {
+ int nextOfs;
+ int prevOfs;
+ };
+ NP* np() { return (NP*) &nextOfs; }
+
+ // ---------------------
+ // memory cache
+ // ---------------------
+
+        /**
+         * touches the data so that it is in physical memory
+         * @param entireRecord if false, only the header and first byte are touched
+         *                     if true, the entire record is touched
+         */
+        void touch( bool entireRecord = false );
+
+ /**
+         * @return whether this record is likely in physical memory
+         *         not guaranteed, because it is possible for it to get swapped out in a very unlucky window
+ */
+ bool likelyInPhysicalMemory();
+
+ /**
+ * tell the cache this Record was accessed
+ * @return this, for simple chaining
+ */
+ Record* accessed();
+
+ static bool MemoryTrackingEnabled;
+ };
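+
+    /* usage sketch (illustrative): callers that are about to read a record
+       typically chain the cache bookkeeping with the access itself, as
+       DiskLoc::obj() does below:
+
+           Record *r = DataFileMgr::getRecord(dl)->accessed();
+           r->touch();   // fault the header into physical memory first
+    */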
+
+ /* extents are datafile regions where all the records within the region
+ belong to the same namespace.
+
+ (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
+ (11:12:55 AM) dm10gen: and that is placed on the free list
+ */
+ class Extent {
+ public:
+ unsigned magic;
+ DiskLoc myLoc;
+ DiskLoc xnext, xprev; /* next/prev extent for this namespace */
+
+ /* which namespace this extent is for. this is just for troubleshooting really
+ and won't even be correct if the collection were renamed!
+ */
+ Namespace nsDiagnostic;
+
+ int length; /* size of the extent, including these fields */
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ char _extentData[4];
+
+ static int HeaderSize() { return sizeof(Extent)-4; }
+
+ bool validates() {
+ return !(firstRecord.isNull() ^ lastRecord.isNull()) &&
+ length >= 0 && !myLoc.isNull();
+ }
+
+ BSONObj dump() {
+ return BSON( "loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" << xprev.toString()
+ << "nsdiag" << nsDiagnostic.toString()
+ << "size" << length << "firstRecord" << firstRecord.toString() << "lastRecord" << lastRecord.toString());
+ }
+
+ void dump(iostream& s) {
+ s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
+ s << " nsdiag:" << nsDiagnostic.toString() << '\n';
+ s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n';
+ }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps
+ Returns a DeletedRecord location which is the data in the extent ready for us.
+           Caller will need to add that to the freelist structure in NamespaceDetails.
+ */
+ DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset, bool capped);
+
+ /* like init(), but for a reuse case */
+ DiskLoc reuse(const char *nsname, bool newUseIsAsCapped);
+
+ bool isOk() const { return magic == 0x41424344; }
+ void assertOk() const { assert(isOk()); }
+
+ Record* newRecord(int len);
+
+ Record* getRecord(DiskLoc dl) {
+ assert( !dl.isNull() );
+ assert( dl.sameFile(myLoc) );
+ int x = dl.getOfs() - myLoc.getOfs();
+ assert( x > 0 );
+ return (Record *) (((char *) this) + x);
+ }
+
+ Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); }
+ Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); }
+
+ static int maxSize();
+ static int minSize() { return 0x100; }
+ /**
+         * @param len length of the record we need
+         * @param lastExtentLen size of the last extent, which is a factor in the next extent size
+ */
+ static int followupSize(int len, int lastExtentLen);
+
+ /** get a suggested size for the first extent in a namespace
+ * @param len length of record we need to insert
+ */
+ static int initialSize(int len);
+
+ struct FL {
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ };
+ /** often we want to update just the firstRecord and lastRecord fields.
+ this helper is for that -- for use with getDur().writing() method
+ */
+ FL* fl() { return (FL*) &firstRecord; }
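+        /* usage sketch (illustrative only -- e and newLoc are hypothetical):
+
+               Extent::FL *fl = getDur().writing( e->fl() );
+               fl->firstRecord = fl->lastRecord = newLoc;
+        */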
+
+ /** caller must declare write intent first */
+ void markEmpty();
+ private:
+ DiskLoc _reuse(const char *nsname, bool newUseIsAsCapped); // recycle an extent and reuse it for a different ns
+ };
+
+ /* a datafile - i.e. the "dbname.<#>" files :
+
+ ----------------------
+ DataFileHeader
+ ----------------------
+ Extent (for a particular namespace)
+ Record
+ ...
+ Record (some chained for unused space)
+ ----------------------
+ more Extents...
+ ----------------------
+ */
+ class DataFileHeader {
+ public:
+ int version;
+ int versionMinor;
+ int fileLength;
+ DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more */
+ int unusedLength;
+ char reserved[8192 - 4*4 - 8];
+
+ char data[4]; // first extent starts here
+
+ enum { HeaderSize = 8192 };
+
+ bool isCurrentVersion() const { return ( version == PDFILE_VERSION ) && ( versionMinor == PDFILE_VERSION_MINOR ); }
+
+ bool uninitialized() const { return version == 0; }
+
+ void init(int fileno, int filelength, const char* filename) {
+ if ( uninitialized() ) {
+ DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl;
+ if( !(filelength > 32768 ) ) {
+ massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false);
+ }
+
+ {
+ if( !d.dbMutex.isWriteLocked() ) {
+ log() << "*** TEMP NOT INITIALIZING FILE " << filename << ", not in a write lock." << endl;
+ log() << "temp bypass until more elaborate change - case that is manifesting is benign anyway" << endl;
+ return;
+/**
+ log() << "ERROR can't create outside a write lock" << endl;
+ printStackTrace();
+ ::abort();
+**/
+ }
+ }
+
+ getDur().createdFile(filename, filelength);
+ assert( HeaderSize == 8192 );
+ DataFileHeader *h = getDur().writing(this);
+ h->fileLength = filelength;
+ h->version = PDFILE_VERSION;
+ h->versionMinor = PDFILE_VERSION_MINOR;
+ h->unused.set( fileno, HeaderSize );
+ assert( (data-(char*)this) == HeaderSize );
+ h->unusedLength = fileLength - HeaderSize - 16;
+ }
+ }
+
+ bool isEmpty() const {
+ return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 );
+ }
+ };
+
+#pragma pack()
+
+ inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const {
+ loc.assertOk();
+ Extent *e = (Extent *) (p()+loc.getOfs());
+ return e;
+ }
+
+ inline Extent* MongoDataFile::getExtent(DiskLoc loc) const {
+ Extent *e = _getExtent(loc);
+ e->assertOk();
+ return e;
+ }
+
+} // namespace mongo
+
+#include "cursor.h"
+
+namespace mongo {
+
+ inline Record* MongoDataFile::recordAt(DiskLoc dl) {
+ int ofs = dl.getOfs();
+ if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
+ return (Record*) (p()+ofs);
+ }
+
+ inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) {
+ int ofs = dl.getOfs();
+ if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path
+ return (Record*) (p()+ofs);
+ }
+
+ inline DiskLoc Record::getNext(const DiskLoc& myLoc) {
+ if ( nextOfs != DiskLoc::NullOfs ) {
+ /* defensive */
+ if ( nextOfs >= 0 && nextOfs < 10 ) {
+ sayDbContext("Assertion failure - Record::getNext() referencing a deleted record?");
+ return DiskLoc();
+ }
+
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+ Extent *e = myExtent(myLoc);
+ while ( 1 ) {
+ if ( e->xnext.isNull() )
+ return DiskLoc(); // end of table.
+ e = e->xnext.ext();
+ if ( !e->firstRecord.isNull() )
+ break;
+ // entire extent could be empty, keep looking
+ }
+ return e->firstRecord;
+ }
+ inline DiskLoc Record::getPrev(const DiskLoc& myLoc) {
+ if ( prevOfs != DiskLoc::NullOfs )
+ return DiskLoc(myLoc.a(), prevOfs);
+ Extent *e = myExtent(myLoc);
+ if ( e->xprev.isNull() )
+ return DiskLoc();
+ return e->xprev.ext()->lastRecord;
+ }
+
+ inline BSONObj DiskLoc::obj() const {
+ return BSONObj(rec()->accessed());
+ }
+ inline DeletedRecord* DiskLoc::drec() const {
+ assert( _a != -1 );
+ return (DeletedRecord*) rec();
+ }
+ inline Extent* DiskLoc::ext() const {
+ return DataFileMgr::getExtent(*this);
+ }
+
+ template< class V >
+ inline
+ const BtreeBucket<V> * DiskLoc::btree() const {
+ assert( _a != -1 );
+ return (const BtreeBucket<V> *) rec()->data;
+ }
+
+} // namespace mongo
+
+#include "database.h"
+
+namespace mongo {
+
+ boost::intmax_t dbSize( const char *database );
+
+ inline NamespaceIndex* nsindex(const char *ns) {
+ Database *database = cc().database();
+ assert( database );
+ DEV {
+ char buf[256];
+ nsToDatabase(ns, buf);
+ if ( database->name != buf ) {
+ out() << "ERROR: attempt to write to wrong database\n";
+ out() << " ns:" << ns << '\n';
+ out() << " database->name:" << database->name << endl;
+ assert( database->name == buf );
+ }
+ }
+ return &database->namespaceIndex;
+ }
+
+ inline NamespaceDetails* nsdetails(const char *ns) {
+ // if this faults, did you set the current db first? (Client::Context + dblock)
+ return nsindex(ns)->details(ns);
+ }
+
+ inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->getExtent(dl);
+ }
+
+ inline Record* DataFileMgr::getRecord(const DiskLoc& dl) {
+ assert( dl.a() != -1 );
+ return cc().database()->getFile(dl.a())->recordAt(dl);
+ }
+
+ BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) );
+
+ inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) {
+ assert( dl.a() != -1 );
+ return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord));
+ }
+
+ void ensureHaveIdIndex(const char *ns);
+
+ bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex );
+
+ inline BSONObj::BSONObj(const Record *r) {
+ init(r->data);
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/pipeline/accumulator.cpp b/src/mongo/db/pipeline/accumulator.cpp
new file mode 100755
index 00000000000..9ef8aa39470
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator.cpp
@@ -0,0 +1,92 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/accumulator.h"
+
+#include "db/jsobj.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ void Accumulator::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ uassert(15943, str::stream() << "group accumulator " <<
+ getOpName() << " only accepts one operand",
+ vpOperand.size() < 1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ Accumulator::Accumulator():
+ ExpressionNary() {
+ }
+
+ void Accumulator::opToBson(
+ BSONObjBuilder *pBuilder, string opName,
+ string fieldName, unsigned depth) const {
+ assert(vpOperand.size() == 1);
+ BSONObjBuilder builder;
+ vpOperand[0]->addToBsonObj(&builder, opName, depth);
+ pBuilder->append(fieldName, builder.done());
+ }
+
+ void Accumulator::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ opToBson(pBuilder, getOpName(), fieldName, depth);
+ }
+
+ void Accumulator::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false); // these can't appear in arrays
+ }
+
+ void agg_framework_reservedErrors() {
+ uassert(16017, "reserved error", false);
+ uassert(16018, "reserved error", false);
+ uassert(16019, "reserved error", false);
+ uassert(16020, "reserved error", false);
+ uassert(16021, "reserved error", false);
+ uassert(16022, "reserved error", false);
+ uassert(16023, "reserved error", false);
+ uassert(16024, "reserved error", false);
+ uassert(16025, "reserved error", false);
+ uassert(16026, "reserved error", false);
+ uassert(16027, "reserved error", false);
+ uassert(16028, "reserved error", false);
+ uassert(16029, "reserved error", false);
+ uassert(16030, "reserved error", false);
+ uassert(16031, "reserved error", false);
+ uassert(16032, "reserved error", false);
+ uassert(16033, "reserved error", false);
+
+ uassert(16036, "reserved error", false);
+ uassert(16037, "reserved error", false);
+ uassert(16038, "reserved error", false);
+ uassert(16039, "reserved error", false);
+ uassert(16040, "reserved error", false);
+ uassert(16041, "reserved error", false);
+ uassert(16042, "reserved error", false);
+ uassert(16043, "reserved error", false);
+ uassert(16044, "reserved error", false);
+ uassert(16045, "reserved error", false);
+ uassert(16046, "reserved error", false);
+ uassert(16047, "reserved error", false);
+ uassert(16048, "reserved error", false);
+ uassert(16049, "reserved error", false);
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator.h b/src/mongo/db/pipeline/accumulator.h
new file mode 100755
index 00000000000..a75b2c9abaa
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator.h
@@ -0,0 +1,259 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_set.hpp>
+#include "db/pipeline/value.h"
+#include "db/pipeline/expression.h"
+#include "bson/bsontypes.h"
+
+namespace mongo {
+ class ExpressionContext;
+
+ class Accumulator :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Get the accumulated value.
+
+ @returns the accumulated value
+ */
+ virtual intrusive_ptr<const Value> getValue() const = 0;
+
+ protected:
+ Accumulator();
+
+ /*
+ Convenience method for doing this for accumulators. The pattern
+ is always the same, so a common implementation works, but requires
+ knowing the operator name.
+
+          @param pBuilder the builder to add to
+          @param opName the operator name
+          @param fieldName the projected name
+        */
+        void opToBson(
+            BSONObjBuilder *pBuilder, string opName, string fieldName,
+ unsigned depth) const;
+ };
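+
+    /* typical accumulator lifecycle (illustrative sketch only; the real
+       driver is the $group document source, which is not shown here):
+
+           intrusive_ptr<Accumulator> pSum(AccumulatorSum::create(pCtx));
+           pSum->addOperand(pExpression);         // at most one operand
+           // for each input document pDoc:
+           //     pSum->evaluate(pDoc);           // folds pDoc into the total
+           intrusive_ptr<const Value> pTotal(pSum->getValue());
+    */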
+
+
+ class AccumulatorAddToSet :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an appending accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorAddToSet(const intrusive_ptr<ExpressionContext> &pTheCtx);
+ typedef boost::unordered_set<intrusive_ptr<const Value>, Value::Hash > SetType;
+ mutable SetType set;
+ mutable SetType::iterator itr;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ /*
+ This isn't a finished accumulator, but rather a convenient base class
+ for others such as $first, $last, $max, $min, and similar. It just
+ provides a holder for a single Value, and the getter for that. The
+ holder is protected so derived classes can manipulate it.
+ */
+ class AccumulatorSingleValue :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> getValue() const;
+
+ protected:
+ AccumulatorSingleValue();
+
+ mutable intrusive_ptr<const Value> pValue; /* current min/max */
+ };
+
+
+ class AccumulatorFirst :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorFirst();
+ };
+
+
+ class AccumulatorLast :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorLast();
+ };
+
+
+ class AccumulatorSum :
+ public Accumulator {
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create a summing accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ protected: /* reused by AccumulatorAvg */
+ AccumulatorSum();
+
+ mutable BSONType totalType;
+ mutable long long longTotal;
+ mutable double doubleTotal;
+ };
+
+
+ class AccumulatorMinMax :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create either the max or min accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+ static intrusive_ptr<Accumulator> createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorMinMax(int theSense);
+
+ int sense; /* 1 for min, -1 for max; used to "scale" comparison */
+ };
+
+
+ class AccumulatorPush :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an appending accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorPush(const intrusive_ptr<ExpressionContext> &pTheCtx);
+
+ mutable vector<intrusive_ptr<const Value> > vpValue;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class AccumulatorAvg :
+ public AccumulatorSum {
+ typedef AccumulatorSum Super;
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an averaging accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ static const char subTotalName[];
+ static const char countName[];
+
+ AccumulatorAvg(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ mutable long long count;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+}
diff --git a/src/mongo/db/pipeline/accumulator_add_to_set.cpp b/src/mongo/db/pipeline/accumulator_add_to_set.cpp
new file mode 100755
index 00000000000..94df0293de4
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_add_to_set.cpp
@@ -0,0 +1,79 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorAddToSet::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ set.insert(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ set.insert(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
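+
+    /* illustrative example: in the router, { $addToSet: "$x" } receives one
+       array per shard, e.g. [1,2] and [2,3]; the loop above folds their
+       elements into a single set, so getValue() returns the distinct values
+       1, 2, 3 (in no particular order) rather than an array of arrays. */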
+
+ intrusive_ptr<const Value> AccumulatorAddToSet::getValue() const {
+ vector<intrusive_ptr<const Value> > valVec;
+
+ for (itr = set.begin(); itr != set.end(); ++itr) {
+ valVec.push_back(*itr);
+ }
+ /* there is no issue of scope since createArray copy constructs */
+ return Value::createArray(valVec);
+ }
+
+ AccumulatorAddToSet::AccumulatorAddToSet(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ set(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorAddToSet::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAddToSet> pAccumulator(
+ new AccumulatorAddToSet(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorAddToSet::getOpName() const {
+ return "$addToSet";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_avg.cpp b/src/mongo/db/pipeline/accumulator_avg.cpp
new file mode 100755
index 00000000000..9f18b1820c8
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_avg.cpp
@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char AccumulatorAvg::subTotalName[] = "subTotal";
+ const char AccumulatorAvg::countName[] = "count";
+
+ intrusive_ptr<const Value> AccumulatorAvg::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ if (!pCtx->getInRouter()) {
+ Super::evaluate(pDocument);
+ ++count;
+ }
+ else {
+ /*
+ If we're in the router, we expect an object that contains
+ both a subtotal and a count. This is what getValue() produced
+ below.
+ */
+ intrusive_ptr<const Value> prhs(
+ vpOperand[0]->evaluate(pDocument));
+ assert(prhs->getType() == Object);
+ intrusive_ptr<Document> pShardDoc(prhs->getDocument());
+
+ intrusive_ptr<const Value> pSubTotal(
+ pShardDoc->getValue(subTotalName));
+ assert(pSubTotal.get());
+ BSONType subTotalType = pSubTotal->getType();
+ if ((totalType == NumberLong) || (subTotalType == NumberLong))
+ totalType = NumberLong;
+ if ((totalType == NumberDouble) || (subTotalType == NumberDouble))
+ totalType = NumberDouble;
+
+ if (subTotalType == NumberInt) {
+ int v = pSubTotal->getInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (subTotalType == NumberLong) {
+ long long v = pSubTotal->getLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else {
+ double v = pSubTotal->getDouble();
+ doubleTotal += v;
+ }
+
+ intrusive_ptr<const Value> pCount(pShardDoc->getValue(countName));
+ count += pCount->getLong();
+ }
+
+ return Value::getZero();
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorAvg::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAvg> pA(new AccumulatorAvg(pCtx));
+ return pA;
+ }
+
+ intrusive_ptr<const Value> AccumulatorAvg::getValue() const {
+ if (!pCtx->getInShard()) {
+ double avg = 0;
+ if (count) {
+ if (totalType != NumberDouble)
+                    avg = static_cast<double>(longTotal) / count; // cast before dividing to avoid integer truncation
+ else
+ avg = doubleTotal / count;
+ }
+
+ return Value::createDouble(avg);
+ }
+
+ intrusive_ptr<Document> pDocument(Document::create());
+
+ intrusive_ptr<const Value> pSubTotal;
+ if (totalType == NumberInt)
+ pSubTotal = Value::createInt((int)longTotal);
+ else if (totalType == NumberLong)
+ pSubTotal = Value::createLong(longTotal);
+ else
+ pSubTotal = Value::createDouble(doubleTotal);
+ pDocument->addField(subTotalName, pSubTotal);
+
+ intrusive_ptr<const Value> pCount(Value::createLong(count));
+ pDocument->addField(countName, pCount);
+
+ return Value::createDocument(pDocument);
+ }
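+
+    /* illustrative example: in a shard, getValue() above emits a partial
+       result such as { subTotal: 42, count: NumberLong(7) }; the router's
+       evaluate() consumes exactly this shape and divides at the end. */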
+
+ AccumulatorAvg::AccumulatorAvg(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ AccumulatorSum(),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ const char *AccumulatorAvg::getOpName() const {
+ return "$avg";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_first.cpp b/src/mongo/db/pipeline/accumulator_first.cpp
new file mode 100755
index 00000000000..c947aa83996
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_first.cpp
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorFirst::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* only remember the first value seen */
+ if (!pValue.get())
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorFirst::AccumulatorFirst():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorFirst::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorFirst> pAccumulator(
+ new AccumulatorFirst());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorFirst::getOpName() const {
+ return "$first";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_last.cpp b/src/mongo/db/pipeline/accumulator_last.cpp
new file mode 100755
index 00000000000..c134fc83159
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_last.cpp
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorLast::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* always remember the last value seen */
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorLast::AccumulatorLast():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorLast::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorLast> pAccumulator(
+ new AccumulatorLast());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorLast::getOpName() const {
+ return "$last";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_min_max.cpp b/src/mongo/db/pipeline/accumulator_min_max.cpp
new file mode 100755
index 00000000000..6f078187b44
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_min_max.cpp
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorMinMax::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* if this is the first value, just use it */
+ if (!pValue.get())
+ pValue = prhs;
+ else {
+ /* compare with the current value; swap if appropriate */
+ int cmp = Value::compare(pValue, prhs) * sense;
+ if (cmp > 0)
+ pValue = prhs;
+ }
+
+ return pValue;
+ }
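+
+    /* worked example: with sense == -1 ($max), Value::compare(pValue, prhs)
+       is negated, so cmp > 0 exactly when the new value is greater and the
+       swap keeps the maximum; sense == 1 keeps the minimum symmetrically. */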
+
+ AccumulatorMinMax::AccumulatorMinMax(int theSense):
+ AccumulatorSingleValue(),
+ sense(theSense) {
+ assert((sense == 1) || (sense == -1));
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(1));
+ return pAccumulator;
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(-1));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorMinMax::getOpName() const {
+ if (sense == 1)
+ return "$min";
+ return "$max";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_push.cpp b/src/mongo/db/pipeline/accumulator_push.cpp
new file mode 100755
index 00000000000..2640bc4ecfd
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_push.cpp
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorPush::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ vpValue.push_back(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ vpValue.push_back(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
+
+ intrusive_ptr<const Value> AccumulatorPush::getValue() const {
+ return Value::createArray(vpValue);
+ }
+
+ AccumulatorPush::AccumulatorPush(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ vpValue(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorPush::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorPush> pAccumulator(
+ new AccumulatorPush(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorPush::getOpName() const {
+ return "$push";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_single_value.cpp b/src/mongo/db/pipeline/accumulator_single_value.cpp
new file mode 100755
index 00000000000..bfec80387d3
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_single_value.cpp
@@ -0,0 +1,32 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSingleValue::getValue() const {
+ return pValue;
+ }
+
+ AccumulatorSingleValue::AccumulatorSingleValue():
+ pValue(intrusive_ptr<const Value>()) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/accumulator_sum.cpp b/src/mongo/db/pipeline/accumulator_sum.cpp
new file mode 100755
index 00000000000..e6526ac254a
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_sum.cpp
@@ -0,0 +1,74 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSum::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* upgrade to the widest type required to hold the result */
+ totalType = Value::getWidestNumeric(totalType, prhs->getType());
+
+ if (totalType == NumberInt) {
+ int v = prhs->coerceToInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (totalType == NumberLong) {
+ long long v = prhs->coerceToLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else { /* (totalType == NumberDouble) */
+ double v = prhs->coerceToDouble();
+ doubleTotal += v;
+ }
+
+ return Value::getZero();
+ }
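+
+    /* worked example: summing NumberInt(1), NumberInt(2), NumberDouble(0.5)
+       widens totalType from NumberInt to NumberDouble on the third input,
+       so getValue() below returns NumberDouble(3.5). */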
+
+ intrusive_ptr<Accumulator> AccumulatorSum::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorSum> pSummer(new AccumulatorSum());
+ return pSummer;
+ }
+
+ intrusive_ptr<const Value> AccumulatorSum::getValue() const {
+ if (totalType == NumberInt)
+ return Value::createInt((int)longTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createDouble(doubleTotal);
+ }
+
+ AccumulatorSum::AccumulatorSum():
+ Accumulator(),
+ totalType(NumberInt),
+ longTotal(0),
+ doubleTotal(0) {
+ }
+
+ const char *AccumulatorSum::getOpName() const {
+ return "$sum";
+ }
+}
diff --git a/src/mongo/db/pipeline/builder.cpp b/src/mongo/db/pipeline/builder.cpp
new file mode 100755
index 00000000000..cbde3705656
--- /dev/null
+++ b/src/mongo/db/pipeline/builder.cpp
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+
+
+namespace mongo {
+
+ void BuilderObj::append() {
+ pBuilder->appendNull(fieldName);
+ }
+
+ void BuilderObj::append(bool b) {
+ pBuilder->append(fieldName, b);
+ }
+
+ void BuilderObj::append(int i) {
+ pBuilder->append(fieldName, i);
+ }
+
+ void BuilderObj::append(long long ll) {
+ pBuilder->append(fieldName, ll);
+ }
+
+ void BuilderObj::append(double d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(string s) {
+ pBuilder->append(fieldName, s);
+ }
+
+ void BuilderObj::append(const OID &o) {
+ pBuilder->append(fieldName, o);
+ }
+
+ void BuilderObj::append(const Date_t &d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(BSONObjBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->done());
+ }
+
+ void BuilderObj::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->arr());
+ }
+
+ BuilderObj::BuilderObj(
+ BSONObjBuilder *pObjBuilder, string theFieldName):
+ pBuilder(pObjBuilder),
+ fieldName(theFieldName) {
+ }
+
+
+ void BuilderArray::append() {
+ pBuilder->appendNull();
+ }
+
+ void BuilderArray::append(bool b) {
+ pBuilder->append(b);
+ }
+
+ void BuilderArray::append(int i) {
+ pBuilder->append(i);
+ }
+
+ void BuilderArray::append(long long ll) {
+ pBuilder->append(ll);
+ }
+
+ void BuilderArray::append(double d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(string s) {
+ pBuilder->append(s);
+ }
+
+ void BuilderArray::append(const OID &o) {
+ pBuilder->append(o);
+ }
+
+ void BuilderArray::append(const Date_t &d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(BSONObjBuilder *pDone) {
+ pBuilder->append(pDone->done());
+ }
+
+ void BuilderArray::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(pDone->arr());
+ }
+
+ BuilderArray::BuilderArray(
+ BSONArrayBuilder *pArrayBuilder):
+ pBuilder(pArrayBuilder) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/builder.h b/src/mongo/db/pipeline/builder.h
new file mode 100755
index 00000000000..bdf71cd784c
--- /dev/null
+++ b/src/mongo/db/pipeline/builder.h
@@ -0,0 +1,95 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class BSONArrayBuilder;
+ class BSONObjBuilder;
+
+ /*
+ Generic Builder.
+
+ The methods to append items to an object (on BSONObjBuilder) and an array
+ (on BSONArrayBuilder) differ only by their inclusion of a field name.
+ For more complicated implementations of addToBsonObj() and
+ addToBsonArray(), it makes sense to abstract that out and use
+ this generic builder that always looks the same, and then implement
+ addToBsonObj() and addToBsonArray() by using a common method.
+ */
+ class Builder :
+ boost::noncopyable {
+ public:
+ virtual ~Builder() {};
+
+ virtual void append() = 0; // append a null
+ virtual void append(bool b) = 0;
+ virtual void append(int i) = 0;
+ virtual void append(long long ll) = 0;
+ virtual void append(double d) = 0;
+ virtual void append(string s) = 0;
+ virtual void append(const OID &o) = 0;
+ virtual void append(const Date_t &d) = 0;
+ virtual void append(BSONObjBuilder *pDone) = 0;
+ virtual void append(BSONArrayBuilder *pDone) = 0;
+ };
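+
+    /* usage sketch (illustrative): a single serializer can then target either
+       an object or an array context without duplicating the append logic:
+
+           void writeN(Builder *pB) { pB->append(5); }
+           // object: BuilderObj bo(pObjBuilder, "n");  writeN(&bo);
+           // array:  BuilderArray ba(pArrayBuilder);   writeN(&ba);
+    */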
+
+ class BuilderObj :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderObj(BSONObjBuilder *pBuilder, string fieldName);
+
+ private:
+ BSONObjBuilder *pBuilder;
+ string fieldName;
+ };
+
+ class BuilderArray :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderArray(BSONArrayBuilder *pBuilder);
+
+ private:
+ BSONArrayBuilder *pBuilder;
+ };
+}
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.cpp b/src/mongo/db/pipeline/doc_mem_monitor.cpp
new file mode 100755
index 00000000000..ffbe9c88854
--- /dev/null
+++ b/src/mongo/db/pipeline/doc_mem_monitor.cpp
@@ -0,0 +1,68 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "util/systeminfo.h"
+
+namespace mongo {
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW) {
+ /*
+ Use the default values.
+
+          Currently, we warn in the log at 5% of physical RAM, and assert at 10%.
+ */
+ size_t errorRam = SystemInfo::getPhysicalRam() / 10;
+ size_t warnRam = errorRam / 2;
+
+ init(pW, warnRam, errorRam);
+ }
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ init(pW, warnLimit, errorLimit);
+ }
+
+ void DocMemMonitor::addToTotal(size_t amount) {
+ totalUsed += amount;
+
+ if (!warned) {
+ if (warnLimit && (totalUsed > warnLimit)) {
+ stringstream ss;
+ ss << "warning, 5% of physical RAM used for ";
+ pWriter->writeString(ss);
+ ss << endl;
+ warning() << ss.str();
+ warned = true;
+ }
+ }
+
+ if (errorLimit) {
+ uassert(15944, "terminating request: request heap use exceeded 10% of physical RAM", (totalUsed <= errorLimit));
+ }
+ }
+
+ void DocMemMonitor::init(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ this->pWriter = pW;
+ this->warnLimit = warnLimit;
+ this->errorLimit = errorLimit;
+
+ warned = false;
+ totalUsed = 0;
+ }
+}
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.h b/src/mongo/db/pipeline/doc_mem_monitor.h
new file mode 100755
index 00000000000..e368acc906a
--- /dev/null
+++ b/src/mongo/db/pipeline/doc_mem_monitor.h
@@ -0,0 +1,94 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "util/string_writer.h"
+
+
+namespace mongo {
+
+ /*
+ This utility class provides an easy way to total up, monitor, warn, and
+ signal an error when the amount of memory used for an operation exceeds
+ given thresholds.
+
+ Create a local instance of this class, and then inform it of any memory
+ that you consume using addToTotal().
+
+ Warnings or errors are issued as usage exceeds certain fractions of
+ physical memory on the host, as determined by SystemInfo.
+
+ This class is not guaranteed to warn or signal errors if the host system
+ does not support the ability to report its memory, as per the warnings
+ for SystemInfo in systeminfo.h.
+ */
+ class DocMemMonitor {
+ public:
+ /*
+ Constructor.
+
+ Uses default limits for warnings and errors.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+ */
+ DocMemMonitor(StringWriter *pWriter);
+
+ /*
+ Constructor.
+
+ This variant allows explicit selection of the limits. Note that
+ limits of zero are treated as infinite.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+          @param warnLimit the amount of RAM usage at which to log a warning
+          @param errorLimit the amount of RAM usage at which to throw an error
+ */
+ DocMemMonitor(StringWriter *pWriter, size_t warnLimit,
+ size_t errorLimit);
+
+ /*
+ Increment the total amount of memory used by the given amount. If
+ the warning threshold is exceeded, a warning will be logged. If the
+ error threshold is exceeded, an error will be thrown.
+
+ @param amount the amount of memory to add to the current total
+ */
+ void addToTotal(size_t amount);
+
+ private:
+ /*
+ Real constructor body.
+
+ Provides common construction for all the variant constructors.
+ */
+ void init(StringWriter *pW, size_t warnLimit, size_t errorLimit);
+
+ bool warned;
+ size_t totalUsed;
+ size_t warnLimit;
+ size_t errorLimit;
+ StringWriter *pWriter;
+ };
+
+}
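A minimal sketch of the intended usage, assuming a StringWriter subclass that names
the operation being monitored (OpNameWriter here is hypothetical; only the
writeString() signature is taken from util/string_writer.h):

    // Hypothetical writer that identifies the operation in warnings.
    class OpNameWriter : public StringWriter {
    public:
        virtual void writeString(stringstream &ss) const {
            ss << "example pipeline operation";
        }
    };

    void consumeBuffers() {
        OpNameWriter writer;
        DocMemMonitor monitor(&writer); // default limits: warn 5%, error 10%

        for (int i = 0; i < 1000; ++i) {
            size_t bufferSize = 1024 * 1024; // pretend we allocated this
            // logs once past the warn limit, uasserts past the error limit
            monitor.addToTotal(bufferSize);
        }
    }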
diff --git a/src/mongo/db/pipeline/document.cpp b/src/mongo/db/pipeline/document.cpp
new file mode 100755
index 00000000000..a49c7e303c1
--- /dev/null
+++ b/src/mongo/db/pipeline/document.cpp
@@ -0,0 +1,219 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ string Document::idName("_id");
+
+ intrusive_ptr<Document> Document::createFromBsonObj(BSONObj *pBsonObj) {
+ intrusive_ptr<Document> pDocument(new Document(pBsonObj));
+ return pDocument;
+ }
+
+ Document::Document(BSONObj *pBsonObj):
+ vFieldName(),
+ vpValue() {
+ BSONObjIterator bsonIterator(pBsonObj->begin());
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ string fieldName(bsonElement.fieldName());
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&bsonElement));
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+ }
+
+ void Document::toBson(BSONObjBuilder *pBuilder) {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i)
+ vpValue[i]->addToBsonObj(pBuilder, vFieldName[i]);
+ }
+
+ intrusive_ptr<Document> Document::create(size_t sizeHint) {
+ intrusive_ptr<Document> pDocument(new Document(sizeHint));
+ return pDocument;
+ }
+
+ Document::Document(size_t sizeHint):
+ vFieldName(),
+ vpValue() {
+ if (sizeHint) {
+ vFieldName.reserve(sizeHint);
+ vpValue.reserve(sizeHint);
+ }
+ }
+
+ intrusive_ptr<Document> Document::clone() {
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pNew(Document::create(n));
+ for(size_t i = 0; i < n; ++i)
+ pNew->addField(vFieldName[i], vpValue[i]);
+
+ return pNew;
+ }
+
+ Document::~Document() {
+ }
+
+ FieldIterator *Document::createFieldIterator() {
+ return new FieldIterator(intrusive_ptr<Document>(this));
+ }
+
+ intrusive_ptr<const Value> Document::getValue(const string &fieldName) {
+ /*
+ For now, assume the number of fields is small enough that iteration
+ is ok. Later, if this gets large, we can create a map into the
+ vector for these lookups.
+
+ Note that because of the schema-less nature of this data, we always
+ have to look, and can't assume that the requested field is always
+ in a particular place as we would with a statically compilable
+ reference.
+ */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ return(intrusive_ptr<const Value>());
+ }
+
+ void Document::addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ uassert(15945, str::stream() << "cannot add undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+
+ void Document::setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ /* special case: should this field be removed? */
+ if (!pValue.get()) {
+ vFieldName.erase(vFieldName.begin() + index);
+ vpValue.erase(vpValue.begin() + index);
+ return;
+ }
+
+ /* make sure we have a valid value */
+ uassert(15968, str::stream() << "cannot set undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ /* set the indicated field */
+ vFieldName[index] = fieldName;
+ vpValue[index] = pValue;
+ }
+
+ intrusive_ptr<const Value> Document::getField(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ /* if we got here, there's no such field */
+ return intrusive_ptr<const Value>();
+ }
+
+ size_t Document::getApproximateSize() const {
+ size_t size = sizeof(Document);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i)
+ size += vpValue[i]->getApproximateSize();
+
+ return size;
+ }
+
+ size_t Document::getFieldIndex(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ size_t i = 0;
+ for(; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ return i;
+ }
+
+ void Document::hash_combine(size_t &seed) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ boost::hash_combine(seed, vFieldName[i]);
+ vpValue[i]->hash_combine(seed);
+ }
+ }
+
+ int Document::compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR) {
+ const size_t lSize = rL->vFieldName.size();
+ const size_t rSize = rR->vFieldName.size();
+
+ for(size_t i = 0; true; ++i) {
+ if (i >= lSize) {
+ if (i >= rSize)
+ return 0; // documents are the same length
+
+ return -1; // left document is shorter
+ }
+
+ if (i >= rSize)
+ return 1; // right document is shorter
+
+ const int nameCmp = rL->vFieldName[i].compare(rR->vFieldName[i]);
+ if (nameCmp)
+ return nameCmp; // field names are unequal
+
+ const int valueCmp = Value::compare(rL->vpValue[i], rR->vpValue[i]);
+ if (valueCmp)
+ return valueCmp; // fields are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return 0;
+ }
+
+ /* ----------------------- FieldIterator ------------------------------- */
+
+ FieldIterator::FieldIterator(const intrusive_ptr<Document> &pTheDocument):
+ pDocument(pTheDocument),
+ index(0) {
+ }
+
+ bool FieldIterator::more() const {
+ return (index < pDocument->vFieldName.size());
+ }
+
+ pair<string, intrusive_ptr<const Value> > FieldIterator::next() {
+ assert(more());
+ pair<string, intrusive_ptr<const Value> > result(
+ pDocument->vFieldName[index], pDocument->vpValue[index]);
+ ++index;
+ return result;
+ }
+}
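A short sketch of building a Document from BSON and walking its fields, assuming
the interfaces above (note that createFieldIterator() returns a raw pointer the
caller owns):

    void dumpFieldNames(BSONObj obj) {
        // The BSONObj must outlive the Document; values may point into it.
        intrusive_ptr<Document> pDoc(Document::createFromBsonObj(&obj));

        boost::scoped_ptr<FieldIterator> pIter(pDoc->createFieldIterator());
        while (pIter->more()) {
            pair<string, intrusive_ptr<const Value> > field(pIter->next());
            cout << field.first << endl; // field names in BSON order
        }
    }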
diff --git a/src/mongo/db/pipeline/document.h b/src/mongo/db/pipeline/document.h
new file mode 100755
index 00000000000..f11a825151e
--- /dev/null
+++ b/src/mongo/db/pipeline/document.h
@@ -0,0 +1,246 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONObj;
+ class FieldIterator;
+ class Value;
+
+ class Document :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Document();
+
+ /*
+ Create a new Document from the given BSONObj.
+
+ Document field values may be pointed to in the BSONObj, so it
+ must live at least as long as the resulting Document.
+
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> createFromBsonObj(BSONObj *pBsonObj);
+
+ /*
+ Create a new empty Document.
+
+ @param sizeHint a hint at what the number of fields will be; if
+ known, this can be used to increase memory allocation efficiency
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> create(size_t sizeHint = 0);
+
+ /*
+ Clone a document.
+
+ The new document shares all the fields' values with the original.
+
+ This is not a deep copy. Only the fields on the top-level document
+ are cloned.
+
+ @returns the shallow clone of the document
+ */
+ intrusive_ptr<Document> clone();
+
+ /*
+ Add this document to the BSONObj under construction with the
+ given BSONObjBuilder.
+ */
+ void toBson(BSONObjBuilder *pBsonObjBuilder);
+
+ /*
+ Create a new FieldIterator that can be used to examine the
+ Document's fields.
+ */
+ FieldIterator *createFieldIterator();
+
+ /*
+ Get the value of the specified field.
+
+ @param fieldName the name of the field
+            @returns a pointer to the requested field's value
+ */
+ intrusive_ptr<const Value> getValue(const string &fieldName);
+
+ /*
+ Add the given field to the Document.
+
+            BSON documents' fields are ordered; the new field will be
+            appended to the current list of fields.
+
+ It is an error to add a field that has the same name as another
+ field.
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Set the given field to be at the specified position in the
+ Document. This will replace any field that is currently in that
+ position. The index must be within the current range of field
+ indices.
+
+ pValue.get() may be NULL, in which case the field will be
+ removed. fieldName is ignored in this case.
+
+ @param index the field index in the list of fields
+ @param fieldName the new field name
+ @param pValue the new Value
+ */
+ void setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Convenience type for dealing with fields.
+ */
+ typedef pair<string, intrusive_ptr<const Value> > FieldPair;
+
+ /*
+ Get the indicated field.
+
+ @param index the field index in the list of fields
+ @returns the field name and value of the field
+ */
+ FieldPair getField(size_t index) const;
+
+ /*
+ Get the number of fields in the Document.
+
+ @returns the number of fields in the Document
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get the index of the given field.
+
+ @param fieldName the name of the field
+ @returns the index of the field, or if it does not exist, the number
+ of fields (getFieldCount())
+ */
+ size_t getFieldIndex(const string &fieldName) const;
+
+ /*
+ Get a field by name.
+
+ @param fieldName the name of the field
+ @returns the value of the field
+ */
+ intrusive_ptr<const Value> getField(const string &fieldName) const;
+
+ /*
+ Get the approximate storage size of the document, in bytes.
+
+ Under the assumption that field name strings are shared, they are
+ not included in the total.
+
+          @returns the approximate storage size, in bytes
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Compare two documents.
+
+ BSON document field order is significant, so this just goes through
+ the fields in order. The comparison is done in roughly the same way
+ as strings are compared, but comparing one field at a time instead
+ of one character at a time.
+ */
+ static int compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR);
+
+ static string idName; // shared "_id"
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+ @param seed value to augment with this' hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ private:
+ friend class FieldIterator;
+
+ Document(size_t sizeHint);
+ Document(BSONObj *pBsonObj);
+
+ /* these two vectors parallel each other */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<const Value> > vpValue;
+ };
+
+
+ class FieldIterator :
+ boost::noncopyable {
+ public:
+ /*
+ Ask if there are more fields to return.
+
+ @return true if there are more fields, false otherwise
+ */
+ bool more() const;
+
+ /*
+ Move the iterator to point to the next field and return it.
+
+ @return the next field's <name, Value>
+ */
+ Document::FieldPair next();
+
+ private:
+ friend class Document;
+
+ /*
+ Constructor.
+
+ @param pDocument points to the document whose fields are being
+ iterated
+ */
+ FieldIterator(const intrusive_ptr<Document> &pDocument);
+
+ /*
+ We'll hang on to the original document to ensure we keep the
+ fieldPtr vector alive.
+ */
+ intrusive_ptr<Document> pDocument;
+ size_t index; // current field in iteration
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t Document::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline Document::FieldPair Document::getField(size_t index) const {
+ assert( index < vFieldName.size() );
+ return FieldPair(vFieldName[index], vpValue[index]);
+ }
+
+}
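A sketch of the prefix-style comparison semantics described above, assuming two
documents built with create()/addField(); field order is significant:

    // {a: v} vs. {a: v, b: w}: the shorter document compares less, the
    // same way a string compares less than another it is a prefix of.
    int comparePrefixSketch(const intrusive_ptr<const Value> &pV,
                            const intrusive_ptr<const Value> &pW) {
        intrusive_ptr<Document> pLeft(Document::create());
        pLeft->addField("a", pV);

        intrusive_ptr<Document> pRight(Document::create());
        pRight->addField("a", pV);
        pRight->addField("b", pW);

        return Document::compare(pLeft, pRight); // negative: left is shorter
    }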
diff --git a/src/mongo/db/pipeline/document_source.cpp b/src/mongo/db/pipeline/document_source.cpp
new file mode 100755
index 00000000000..813852e35c6
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source.cpp
@@ -0,0 +1,52 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+ DocumentSource::~DocumentSource() {
+ }
+
+ void DocumentSource::setSource(
+ const intrusive_ptr<DocumentSource> &pTheSource) {
+ assert(!pSource.get());
+ pSource = pTheSource;
+ }
+
+ bool DocumentSource::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+ return false;
+ }
+
+ void DocumentSource::optimize() {
+ }
+
+ void DocumentSource::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sourceToBson(&insides);
+ pBuilder->append(insides.done());
+ }
+
+ void DocumentSource::writeString(stringstream &ss) const {
+ BSONArrayBuilder bab;
+ addToBsonArray(&bab);
+ BSONArray ba(bab.arr());
+ ss << ba.toString(/* isArray */true);
+ // our toString should use standard string types.....
+ }
+}
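A sketch of how addToBsonArray() composes a pipeline description, assuming a
vector of already-constructed sources: each source contributes one single-field
object (e.g. { $match: ... }) via its sourceToBson() override.

    BSONArray describePipeline(
        const vector<intrusive_ptr<DocumentSource> > &vpSource) {
        BSONArrayBuilder arrayBuilder;
        const size_t n = vpSource.size();
        for (size_t i = 0; i < n; ++i)
            vpSource[i]->addToBsonArray(&arrayBuilder);

        return arrayBuilder.arr(); // e.g. [{ $match: ... }, { $group: ... }]
    }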
diff --git a/src/mongo/db/pipeline/document_source.h b/src/mongo/db/pipeline/document_source.h
new file mode 100755
index 00000000000..8d5f0f70847
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source.h
@@ -0,0 +1,985 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_map.hpp>
+#include "util/intrusive_counter.h"
+#include "client/parallel.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+#include "util/string_writer.h"
+
+namespace mongo {
+ class Accumulator;
+ class Cursor;
+ class Document;
+ class Expression;
+ class ExpressionContext;
+ class ExpressionFieldPath;
+ class ExpressionObject;
+ class Matcher;
+
+ class DocumentSource :
+ public IntrusiveCounterUnsigned,
+ public StringWriter {
+ public:
+ virtual ~DocumentSource();
+
+ // virtuals from StringWriter
+ /*
+ Write out a string representation of this pipeline operator.
+
+ @param ss string stream to write the string representation to
+ */
+ virtual void writeString(stringstream &ss) const;
+
+
+ /*
+ Is the source at EOF?
+
+ @returns true if the source has no more Documents to return.
+ */
+ virtual bool eof() = 0;
+
+ /*
+ Advance the state of the DocumentSource so that it will return the
+ next Document.
+
+ @returns whether there is another document to fetch, i.e., whether or
+ not getCurrent() will succeed.
+ */
+ virtual bool advance() = 0;
+
+ /*
+          Return the Document at the source's current position.
+
+          @returns the current Document
+          TODO throws an exception if there are no more documents to return.
+ */
+ virtual intrusive_ptr<Document> getCurrent() = 0;
+
+ /*
+ Set the underlying source this source should use to get Documents
+ from.
+
+ It is an error to set the source more than once. This is to
+ prevent changing sources once the original source has been started;
+ this could break the state maintained by the DocumentSource.
+
+ @param pSource the underlying source to use
+ */
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Attempt to coalesce this DocumentSource with its successor in the
+ document processing pipeline. If successful, the successor
+ DocumentSource should be removed from the pipeline and discarded.
+
+ If successful, this operation can be applied repeatedly, in an
+ attempt to coalesce several sources together.
+
+ The default implementation is to do nothing, and return false.
+
+ @param pNextSource the next source in the document processing chain.
+          @returns whether the attempt to coalesce was successful;
+ if the attempt was not successful, nothing has been changed
+ */
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+
+ /*
+ Optimize the pipeline operation, if possible. This is a local
+ optimization that only looks within this DocumentSource. For best
+ results, first coalesce compatible sources using coalesce().
+
+ This is intended for any operations that include expressions, and
+ provides a hook for those to optimize those operations.
+
+ The default implementation is to do nothing.
+ */
+ virtual void optimize();
+
+ /*
+ Add the DocumentSource to the array builder.
+
+ The default implementation calls sourceToBson() in order to
+ convert the inner part of the object which will be added to the
+ array being built here.
+
+ @param pBuilder the array builder to add the operation to.
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder) const;
+
+ protected:
+ /*
+ Create an object that represents the document source. The object
+ will have a single field whose name is the source's name. This
+ will be used by the default implementation of addToBsonArray()
+ to add this object to a pipeline being represented in BSON.
+
+ @param pBuilder a blank object builder to write to
+ */
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const = 0;
+
+ /*
+ Most DocumentSources have an underlying source they get their data
+ from. This is a convenience for them.
+
+ The default implementation of setSource() sets this; if you don't
+ need a source, override that to assert(). The default is to
+ assert() if this has already been set.
+ */
+ intrusive_ptr<DocumentSource> pSource;
+ };
+
+
+ class DocumentSourceBsonArray :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceBsonArray();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a BSON array.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ CAUTION: the BSON is not read until the source is used. Any
+ elements that appear after these documents must not be read until
+ this source is exhausted.
+
+ @param pBsonElement the BSON array to treat as a document source
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceBsonArray> create(
+ BSONElement *pBsonElement);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceBsonArray(BSONElement *pBsonElement);
+
+ BSONObj embeddedObject;
+ BSONObjIterator arrayIterator;
+ BSONElement currentElement;
+ bool haveCurrent;
+ };
+
+
+ class DocumentSourceCommandFutures :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCommandFutures();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /* convenient shorthand for a commonly used type */
+ typedef list<shared_ptr<Future::CommandResult> > FuturesList;
+
+ /*
+          Create a DocumentSource that wraps a list of Future::CommandResults.
+
+ @param errmsg place to write error messages to; must exist for the
+ lifetime of the created DocumentSourceCommandFutures
+ @param pList the list of futures
+ */
+ static intrusive_ptr<DocumentSourceCommandFutures> create(
+ string &errmsg, FuturesList *pList);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCommandFutures(string &errmsg, FuturesList *pList);
+
+ /*
+ Advance to the next document, setting pCurrent appropriately.
+
+ Adjusts pCurrent, pBsonSource, and iterator, as needed. On exit,
+ pCurrent is the Document to return, or NULL. If NULL, this
+ indicates there is nothing more to return.
+ */
+ void getNextDocument();
+
+ bool newSource; // set to true for the first item of a new source
+ intrusive_ptr<DocumentSourceBsonArray> pBsonSource;
+ intrusive_ptr<Document> pCurrent;
+ FuturesList::iterator iterator;
+ FuturesList::iterator listEnd;
+ string &errmsg;
+ };
+
+
+ class DocumentSourceCursor :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCursor();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a cursor.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ @param pCursor the cursor to use to fetch data
+ */
+ static intrusive_ptr<DocumentSourceCursor> create(
+ const shared_ptr<Cursor> &pCursor);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCursor(const shared_ptr<Cursor> &pTheCursor);
+
+ void findNext();
+ shared_ptr<Cursor> pCursor;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ /*
+ This contains all the basic mechanics for filtering a stream of
+ Documents, except for the actual predicate evaluation itself. This was
+ factored out so we could create DocumentSources that use both Matcher
+ style predicates as well as full Expressions.
+ */
+ class DocumentSourceFilterBase :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilterBase();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ virtual void toMatcherBson(BSONObjBuilder *pBuilder) const = 0;
+
+ protected:
+ DocumentSourceFilterBase();
+
+ /*
+ Test the given document against the predicate and report if it
+ should be accepted or not.
+
+ @param pDocument the document to test
+ @returns true if the document matches the filter, false otherwise
+ */
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const = 0;
+
+ private:
+
+ void findNext();
+
+ bool unstarted;
+ bool hasNext;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ class DocumentSourceFilter :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilter();
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ virtual void optimize();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a filter.
+
+ @param pFilter the expression to use to filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSourceFilter> create(
+ const intrusive_ptr<Expression> &pFilter);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char filterName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceFilter(const intrusive_ptr<Expression> &pFilter);
+
+ intrusive_ptr<Expression> pFilter;
+ };
+
+
+ class DocumentSourceGroup :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceGroup();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new grouping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceGroup> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Set the Id Expression.
+
+          Documents that pass through the grouping DocumentSource are grouped
+          according to this key.  This will generate the _id field in the
+ result documents.
+
+ @param pExpression the group key
+ */
+ void setIdExpression(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add an accumulator.
+
+ Accumulators become fields in the Documents that result from
+          grouping.  Each unique group document must have its own
+ accumulator; the accumulator factory is used to create that.
+
+ @param fieldName the name the accumulator result will have in the
+ result documents
+ @param pAccumulatorFactory used to create the accumulator for the
+ group field
+ */
+ void addAccumulator(string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a grouping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $group.
+
+          @param pBsonElement the BSONElement that defines the group
+ @param pCtx the expression context
+ @returns the grouping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ /*
+ Create a unifying group that can be used to combine group results
+ from shards.
+
+ @returns the grouping DocumentSource
+ */
+ intrusive_ptr<DocumentSource> createMerger();
+
+ static const char groupName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceGroup(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+ the underlying source and group it. populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+
+ intrusive_ptr<Expression> pIdExpression;
+
+ typedef boost::unordered_map<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> >, Value::Hash> GroupsType;
+ GroupsType groups;
+
+ /*
+ The field names for the result documents and the accumulator
+ factories for the result documents. The Expressions are the
+ common expressions used by each instance of each accumulator
+ in order to find the right-hand side of what gets added to the
+ accumulator. Note that each of those is the same for each group,
+ so we can share them across all groups by adding them to the
+ accumulators after we use the factories to make a new set of
+ accumulators for each new group.
+
+ These three vectors parallel each other.
+ */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Accumulator> (*)(
+ const intrusive_ptr<ExpressionContext> &)> vpAccumulatorFactory;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+
+ intrusive_ptr<Document> makeDocument(
+ const GroupsType::iterator &rIter);
+
+ GroupsType::iterator groupsIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceMatch :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceMatch();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char matchName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceMatch(const BSONObj &query);
+
+ Matcher matcher;
+ };
+
+
+ class DocumentSourceOut :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceOut();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a document source for output and pass-through.
+
+ This can be put anywhere in a pipeline and will store content as
+ well as pass it on.
+
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceOut> createFromBson(
+ BSONElement *pBsonElement);
+
+ static const char outName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceOut(BSONElement *pBsonElement);
+ };
+
+
+ class DocumentSourceProject :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceProject> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceProject();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void optimize();
+
+ /*
+ Create a new DocumentSource that can implement projection.
+
+ @returns the projection DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceProject> create();
+
+ /*
+ Include a field path in a projection.
+
+ @param fieldPath the path of the field to include
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Exclude a field path from the projection.
+
+ @param fieldPath the path of the field to exclude
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+ Add an output Expression in the projection.
+
+ BSON document fields are ordered, so the new field will be
+ appended to the existing set.
+
+ @param fieldName the name of the field as it will appear
+ @param pExpression the expression used to compute the field
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a new projection DocumentSource from BSON.
+
+ This is a convenience for directly handling BSON, and relies on the
+ above methods.
+
+ @param pBsonElement the BSONElement with an object named $project
+ @returns the created projection
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char projectName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceProject();
+
+ // configuration state
+ bool excludeId;
+ intrusive_ptr<ExpressionObject> pEO;
+ };
+
+
+ class DocumentSourceSort :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSort();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ /*
+ TODO
+ Adjacent sorts should reduce to the last sort.
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ */
+
+ /*
+ Create a new sorting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSort> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Add sort key field.
+
+ Adds a sort key field to the key being built up. A concatenated
+ key is built up by calling this repeatedly.
+
+ @param fieldPath the field path to the key component
+ @param ascending if true, use the key for an ascending sort,
+ otherwise, use it for descending
+ */
+ void addKey(const string &fieldPath, bool ascending);
+
+ /*
+ Write out an object whose contents are the sort key.
+
+ @param pBuilder initialized object builder.
+          @param usePrefix whether or not to include the field prefix
+ */
+ void sortKeyToBson(BSONObjBuilder *pBuilder, bool usePrefix) const;
+
+ /*
+ Create a sorting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $sort.
+
+          @param pBsonElement the BSONElement that defines the sort
+          @param pCtx the expression context
+          @returns the sorting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char sortName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSort(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+          the underlying source and sort it.  populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+ long long count;
+
+ /* these two parallel each other */
+ vector<intrusive_ptr<ExpressionFieldPath> > vSortKey;
+ vector<bool> vAscending;
+
+ class Carrier {
+ public:
+ /*
+ We need access to the key for compares, so we have to carry
+ this around.
+ */
+ DocumentSourceSort *pSort;
+
+ intrusive_ptr<Document> pDocument;
+
+ Carrier(DocumentSourceSort *pSort,
+ const intrusive_ptr<Document> &pDocument);
+
+ static bool lessThan(const Carrier &rL, const Carrier &rR);
+ };
+
+ /*
+ Compare two documents according to the specified sort key.
+
+ @param rL reference to the left document
+ @param rR reference to the right document
+ @returns a number less than, equal to, or greater than zero,
+ indicating pL < pR, pL == pR, or pL > pR, respectively
+ */
+ int compare(const intrusive_ptr<Document> &pL,
+ const intrusive_ptr<Document> &pR);
+
+ typedef list<Carrier> ListType;
+ ListType documents;
+
+ ListType::iterator listIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceLimit :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceLimit();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new limiting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceLimit> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a limiting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $limit.
+
+          @param pBsonElement the BSONElement that defines the limit
+          @param pCtx the expression context
+          @returns the limiting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char limitName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ long long limit;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+ class DocumentSourceSkip :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSkip();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new skipping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSkip> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a skipping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+          a BSONElement that has been determined to be an Object with an
+          element named $skip.
+
+          @param pBsonElement the BSONElement that defines the skip
+          @param pCtx the expression context
+          @returns the skipping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char skipName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Skips initial documents.
+ */
+ void skipper();
+
+ long long skip;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceUnwind :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceUnwind> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceUnwind();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new DocumentSource that can implement unwind.
+
+          @returns the unwind DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceUnwind> create();
+
+ /*
+ Specify the field to unwind. There must be exactly one before
+ the pipeline begins execution.
+
+ @param rFieldPath - path to the field to unwind
+ */
+ void unwindField(const FieldPath &rFieldPath);
+
+ /*
+          Create a new unwinding DocumentSource from BSON.
+
+          This is a convenience for directly handling BSON, and relies on the
+          above methods.
+
+          @param pBsonElement the BSONElement with an object named $unwind
+          @returns the created unwind source
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char unwindName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceUnwind();
+
+ // configuration state
+ FieldPath unwindPath;
+
+ vector<int> fieldIndex; /* for the current document, the indices
+ leading down to the field being unwound */
+
+ // iteration state
+ intrusive_ptr<Document> pNoUnwindDocument;
+ // document to return, pre-unwind
+ intrusive_ptr<const Value> pUnwindArray; // field being unwound
+ intrusive_ptr<ValueIterator> pUnwinder; // iterator used for unwinding
+ intrusive_ptr<const Value> pUnwindValue; // current value
+
+ /*
+ Clear all the state related to unwinding an array.
+ */
+ void resetArray();
+
+ /*
+ Clone the current document being unwound.
+
+ This is a partial deep clone. Because we're going to replace the
+ value at the end, we have to replace everything along the path
+ leading to that in order to not share that change with any other
+ clones (or the original) that we've made.
+
+ This expects pUnwindValue to have been set by a prior call to
+ advance(). However, pUnwindValue may also be NULL, in which case
+ the field will be removed -- this is the action for an empty
+ array.
+
+ @returns a partial deep clone of pNoUnwindDocument
+ */
+ intrusive_ptr<Document> clonePath() const;
+
+ };
+
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void DocumentSourceGroup::setIdExpression(
+ const intrusive_ptr<Expression> &pExpression) {
+ pIdExpression = pExpression;
+ }
+
+ inline void DocumentSourceUnwind::resetArray() {
+ pNoUnwindDocument.reset();
+ pUnwindArray.reset();
+ pUnwinder.reset();
+ pUnwindValue.reset();
+ }
+
+ inline DocumentSourceSort::Carrier::Carrier(
+ DocumentSourceSort *pTheSort,
+ const intrusive_ptr<Document> &pTheDocument):
+ pSort(pTheSort),
+ pDocument(pTheDocument) {
+ }
+}
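A minimal sketch of wiring sources into a pipeline using the interfaces declared
above: setSource() links a stage to its input, and coalesce() lets a stage absorb
its successor before linking.

    intrusive_ptr<DocumentSource> wireStages(
        const intrusive_ptr<DocumentSource> &pInput,
        const intrusive_ptr<DocumentSource> &pFirst,
        const intrusive_ptr<DocumentSource> &pSecond) {
        pFirst->setSource(pInput); // a source may only be set once

        // If pFirst absorbs pSecond, pSecond is discarded rather than
        // linked into the chain.
        if (pFirst->coalesce(pSecond))
            return pFirst;

        pSecond->setSource(pFirst);
        return pSecond; // consumers read from the tail of the chain
    }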
diff --git a/src/mongo/db/pipeline/document_source_bson_array.cpp b/src/mongo/db/pipeline/document_source_bson_array.cpp
new file mode 100755
index 00000000000..5d187b03ef9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_bson_array.cpp
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceBsonArray::~DocumentSourceBsonArray() {
+ }
+
+ bool DocumentSourceBsonArray::eof() {
+ return !haveCurrent;
+ }
+
+ bool DocumentSourceBsonArray::advance() {
+ if (eof())
+ return false;
+
+ if (!arrayIterator.more()) {
+ haveCurrent = false;
+ return false;
+ }
+
+ currentElement = arrayIterator.next();
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceBsonArray::getCurrent() {
+ assert(haveCurrent);
+ BSONObj documentObj(currentElement.Obj());
+ intrusive_ptr<Document> pDocument(
+ Document::createFromBsonObj(&documentObj));
+ return pDocument;
+ }
+
+ void DocumentSourceBsonArray::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ DocumentSourceBsonArray::DocumentSourceBsonArray(
+ BSONElement *pBsonElement):
+ embeddedObject(pBsonElement->embeddedObject()),
+ arrayIterator(embeddedObject),
+ haveCurrent(false) {
+ if (arrayIterator.more()) {
+ currentElement = arrayIterator.next();
+ haveCurrent = true;
+ }
+ }
+
+ intrusive_ptr<DocumentSourceBsonArray> DocumentSourceBsonArray::create(
+ BSONElement *pBsonElement) {
+
+ assert(pBsonElement->type() == Array);
+ intrusive_ptr<DocumentSourceBsonArray> pSource(
+ new DocumentSourceBsonArray(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceBsonArray::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // this has no analog in the BSON world
+ }
+}
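A sketch of constructing this source from a command object, assuming a
hypothetical spec of the form { docs: [ {...}, {...} ] } ("docs" is an invented
field name for illustration):

    intrusive_ptr<DocumentSourceBsonArray> sourceFromSpec(const BSONObj &spec) {
        BSONElement arrayElement(spec["docs"]); // hypothetical field
        assert(arrayElement.type() == Array);   // create() requires an array

        /* per the header's CAUTION: the array is read lazily, so spec
           must outlive the returned source */
        return DocumentSourceBsonArray::create(&arrayElement);
    }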
diff --git a/src/mongo/db/pipeline/document_source_command_futures.cpp b/src/mongo/db/pipeline/document_source_command_futures.cpp
new file mode 100755
index 00000000000..61a257cf16f
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_command_futures.cpp
@@ -0,0 +1,132 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+
+ DocumentSourceCommandFutures::~DocumentSourceCommandFutures() {
+ }
+
+ bool DocumentSourceCommandFutures::eof() {
+ /* if we haven't even started yet, do so */
+ if (!pCurrent.get())
+ getNextDocument();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCommandFutures::advance() {
+ if (eof())
+ return false;
+
+ /* advance */
+ getNextDocument();
+
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCommandFutures::getCurrent() {
+ assert(!eof());
+ return pCurrent;
+ }
+
+ void DocumentSourceCommandFutures::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCommandFutures::sourceToBson(
+ BSONObjBuilder *pBuilder) const {
+ /* this has no BSON equivalent */
+ assert(false);
+ }
+
+ DocumentSourceCommandFutures::DocumentSourceCommandFutures(
+ string &theErrmsg, FuturesList *pList):
+ newSource(false),
+ pBsonSource(),
+ pCurrent(),
+ iterator(pList->begin()),
+ listEnd(pList->end()),
+ errmsg(theErrmsg) {
+ }
+
+ intrusive_ptr<DocumentSourceCommandFutures>
+ DocumentSourceCommandFutures::create(
+ string &errmsg, FuturesList *pList) {
+ intrusive_ptr<DocumentSourceCommandFutures> pSource(
+ new DocumentSourceCommandFutures(errmsg, pList));
+ return pSource;
+ }
+
+ void DocumentSourceCommandFutures::getNextDocument() {
+ while(true) {
+ if (!pBsonSource.get()) {
+ /* if there aren't any more futures, we're done */
+ if (iterator == listEnd) {
+ pCurrent.reset();
+ return;
+ }
+
+ /* grab the next command result */
+ shared_ptr<Future::CommandResult> pResult(*iterator);
+ ++iterator;
+
+ /* try to wait for it */
+ if (!pResult->join()) {
+ error() << "sharded pipeline failed on shard: " <<
+ pResult->getServer() << " error: " <<
+ pResult->result() << endl;
+ errmsg += "-- mongod pipeline failed: ";
+ errmsg += pResult->result().toString();
+
+ /* move on to the next command future */
+ continue;
+ }
+
+ /* grab the result array out of the shard server's response */
+ BSONObj shardResult(pResult->result());
+ BSONObjIterator objIterator(shardResult);
+ while(objIterator.more()) {
+ BSONElement element(objIterator.next());
+ const char *pFieldName = element.fieldName();
+
+ /* find the result array and quit this loop */
+ if (strcmp(pFieldName, "result") == 0) {
+ pBsonSource = DocumentSourceBsonArray::create(&element);
+ newSource = true;
+ break;
+ }
+ }
+ }
+
+ /* if we're done with this shard's results, try the next */
+ if (pBsonSource->eof() ||
+ (!newSource && !pBsonSource->advance())) {
+ pBsonSource.reset();
+ continue;
+ }
+
+ pCurrent = pBsonSource->getCurrent();
+ newSource = false;
+ return;
+ }
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_filter.cpp b/src/mongo/db/pipeline/document_source_filter.cpp
new file mode 100755
index 00000000000..66e57ba2e93
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_filter.cpp
@@ -0,0 +1,98 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceFilter::filterName[] = "$filter";
+
+ DocumentSourceFilter::~DocumentSourceFilter() {
+ }
+
+ bool DocumentSourceFilter::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+
+ /* we only know how to coalesce other filters */
+ DocumentSourceFilter *pDocFilter =
+ dynamic_cast<DocumentSourceFilter *>(pNextSource.get());
+ if (!pDocFilter)
+ return false;
+
+ /*
+ Two adjacent filters can be combined by creating a conjunction of
+ their predicates.
+ */
+ intrusive_ptr<ExpressionNary> pAnd(ExpressionAnd::create());
+ pAnd->addOperand(pFilter);
+ pAnd->addOperand(pDocFilter->pFilter);
+ pFilter = pAnd;
+
+ return true;
+ }
+
+ void DocumentSourceFilter::optimize() {
+ pFilter = pFilter->optimize();
+ }
+
+ void DocumentSourceFilter::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pFilter->addToBsonObj(pBuilder, filterName, 0);
+ }
+
+ bool DocumentSourceFilter::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+ intrusive_ptr<const Value> pValue(pFilter->evaluate(pDocument));
+ return pValue->coerceToBool();
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceFilter::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15946, "a document filter expression must be an object",
+ pBsonElement->type() == Object);
+
+ Expression::ObjectCtx oCtx(0);
+ intrusive_ptr<Expression> pExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ intrusive_ptr<DocumentSourceFilter> pFilter(
+ DocumentSourceFilter::create(pExpression));
+
+ return pFilter;
+ }
+
+ intrusive_ptr<DocumentSourceFilter> DocumentSourceFilter::create(
+ const intrusive_ptr<Expression> &pFilter) {
+ intrusive_ptr<DocumentSourceFilter> pSource(
+ new DocumentSourceFilter(pFilter));
+ return pSource;
+ }
+
+ DocumentSourceFilter::DocumentSourceFilter(
+ const intrusive_ptr<Expression> &pTheFilter):
+ DocumentSourceFilterBase(),
+ pFilter(pTheFilter) {
+ }
+
+ void DocumentSourceFilter::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ pFilter->toMatcherBson(pBuilder, 0);
+ }
+}
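A sketch of the coalescence above: two adjacent filters collapse into a single
stage whose predicate is the conjunction of both, assuming two already-parsed
filter expressions:

    intrusive_ptr<DocumentSource> combineFilters(
        const intrusive_ptr<Expression> &pFirst,
        const intrusive_ptr<Expression> &pSecond) {
        intrusive_ptr<DocumentSourceFilter> pA(
            DocumentSourceFilter::create(pFirst));
        intrusive_ptr<DocumentSourceFilter> pB(
            DocumentSourceFilter::create(pSecond));

        bool combined = pA->coalesce(pB); // pA now tests (pFirst AND pSecond)
        assert(combined);                 // filters always combine
        return pA;                        // pB can be discarded
    }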
diff --git a/src/mongo/db/pipeline/document_source_filter_base.cpp b/src/mongo/db/pipeline/document_source_filter_base.cpp
new file mode 100755
index 00000000000..dbda34b7151
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_filter_base.cpp
@@ -0,0 +1,85 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ DocumentSourceFilterBase::~DocumentSourceFilterBase() {
+ }
+
+ void DocumentSourceFilterBase::findNext() {
+ /* only do this the first time */
+ if (unstarted) {
+ hasNext = !pSource->eof();
+ unstarted = false;
+ }
+
+ while(hasNext) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ hasNext = pSource->advance();
+
+ if (accept(pDocument)) {
+ pCurrent = pDocument;
+ return;
+ }
+ }
+
+ pCurrent.reset();
+ }
+
+ bool DocumentSourceFilterBase::eof() {
+ if (unstarted)
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceFilterBase::advance() {
+ if (unstarted)
+ findNext();
+
+ /*
+ This looks weird after the above, but is correct. Note that calling
+ getCurrent() when first starting already yields the first document
+ in the collection. Calling advance() without using getCurrent()
+ first will skip over the first item.
+ */
+ findNext();
+
+ return (pCurrent.get() != NULL);
+ }
+
+ boost::intrusive_ptr<Document> DocumentSourceFilterBase::getCurrent() {
+ if (unstarted)
+ findNext();
+
+ assert(pCurrent.get() != NULL);
+ return pCurrent;
+ }
+
+ DocumentSourceFilterBase::DocumentSourceFilterBase():
+ unstarted(true),
+ hasNext(false),
+ pCurrent() {
+ }
+}
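
The comment in advance() above describes the cursor contract every stage
in this patch follows: getCurrent() already yields the first document,
and advance() moves past it, so a consumer must read before advancing.
A self-contained sketch of the same convention over a plain vector
(hypothetical names, nothing from mongo's headers):

    #include <cassert>
    #include <iostream>
    #include <vector>

    // hypothetical cursor with the same contract as DocumentSource
    class VecCursor {
        const std::vector<int> &v;
        size_t i;
    public:
        explicit VecCursor(const std::vector<int> &src) : v(src), i(0) {}
        bool eof() const { return i >= v.size(); }
        int getCurrent() const { assert(!eof()); return v[i]; }
        bool advance() { ++i; return !eof(); }
    };

    int main() {
        std::vector<int> data;
        data.push_back(1); data.push_back(2); data.push_back(3);
        VecCursor c(data);
        // the same drain idiom used by findNext() and populate()
        for (bool hasNext = !c.eof(); hasNext; hasNext = c.advance())
            std::cout << c.getCurrent() << "\n"; // read before advancing
        return 0;
    }
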
diff --git a/src/mongo/db/pipeline/document_source_group.cpp b/src/mongo/db/pipeline/document_source_group.cpp
new file mode 100755
index 00000000000..244561589da
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_group.cpp
@@ -0,0 +1,391 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceGroup::groupName[] = "$group";
+
+ DocumentSourceGroup::~DocumentSourceGroup() {
+ }
+
+ bool DocumentSourceGroup::eof() {
+ if (!populated)
+ populate();
+
+ return (groupsIterator == groups.end());
+ }
+
+ bool DocumentSourceGroup::advance() {
+ if (!populated)
+ populate();
+
+ assert(groupsIterator != groups.end());
+
+ ++groupsIterator;
+ if (groupsIterator == groups.end()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = makeDocument(groupsIterator);
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceGroup::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceGroup::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+
+ /* add the _id */
+ pIdExpression->addToBsonObj(&insides, Document::idName.c_str(), 0);
+
+ /* add the remaining fields */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pA((*vpAccumulatorFactory[i])(pCtx));
+ pA->addOperand(vpExpression[i]);
+ pA->addToBsonObj(&insides, vFieldName[i], 0);
+ }
+
+ pBuilder->append(groupName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceGroup> DocumentSourceGroup::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceGroup> pSource(
+ new DocumentSourceGroup(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceGroup::DocumentSourceGroup(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pIdExpression(),
+ groups(),
+ vFieldName(),
+ vpAccumulatorFactory(),
+ vpExpression(),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceGroup::addAccumulator(
+ string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression) {
+ vFieldName.push_back(fieldName);
+ vpAccumulatorFactory.push_back(pAccumulatorFactory);
+ vpExpression.push_back(pExpression);
+ }
+
+
+ struct GroupOpDesc {
+ const char *pName;
+ intrusive_ptr<Accumulator> (*pFactory)(
+ const intrusive_ptr<ExpressionContext> &);
+ };
+
+ static int GroupOpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const GroupOpDesc *)pL)->pName,
+ ((const GroupOpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ GroupOpDescCmp() above.
+ */
+ static const GroupOpDesc GroupOpTable[] = {
+ {"$addToSet", AccumulatorAddToSet::create},
+ {"$avg", AccumulatorAvg::create},
+ {"$first", AccumulatorFirst::create},
+ {"$last", AccumulatorLast::create},
+ {"$max", AccumulatorMinMax::createMax},
+ {"$min", AccumulatorMinMax::createMin},
+ {"$push", AccumulatorPush::create},
+ {"$sum", AccumulatorSum::create},
+ };
+
+ static const size_t NGroupOp = sizeof(GroupOpTable)/sizeof(GroupOpTable[0]);
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15947, "a group's fields must be specified in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceGroup> pGroup(
+ DocumentSourceGroup::create(pCtx));
+ bool idSet = false;
+
+ BSONObj groupObj(pBsonElement->Obj());
+ BSONObjIterator groupIterator(groupObj);
+ while(groupIterator.more()) {
+ BSONElement groupField(groupIterator.next());
+ const char *pFieldName = groupField.fieldName();
+
+ if (strcmp(pFieldName, Document::idName.c_str()) == 0) {
+ uassert(15948, "a group's _id may only be specified once",
+ !idSet);
+
+ BSONType groupType = groupField.type();
+
+ if (groupType == Object) {
+ /*
+ Use the projection-like set of field paths to create the
+ group-by key.
+ */
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pId(
+ Expression::parseObject(&groupField, &oCtx));
+
+ pGroup->setIdExpression(pId);
+ idSet = true;
+ }
+ else if (groupType == String) {
+ string groupString(groupField.String());
+ const char *pGroupString = groupString.c_str();
+ if ((groupString.length() == 0) ||
+ (pGroupString[0] != '$'))
+ goto StringConstantId;
+
+ string pathString(
+ Expression::removeFieldPrefix(groupString));
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ ExpressionFieldPath::create(pathString));
+ pGroup->setIdExpression(pFieldPath);
+ idSet = true;
+ }
+ else {
+ /* pick out the constant types that are allowed */
+ switch(groupType) {
+ case NumberDouble:
+ case String:
+ case Object:
+ case Array:
+ case jstOID:
+ case Bool:
+ case Date:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ StringConstantId: // from string case above
+ {
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&groupField));
+ intrusive_ptr<ExpressionConstant> pConstant(
+ ExpressionConstant::create(pValue));
+ pGroup->setIdExpression(pConstant);
+ idSet = true;
+ break;
+ }
+
+ default:
+ uassert(15949, str::stream() <<
+ "a group's _id may not include fields of BSON type " << groupType,
+ false);
+ }
+ }
+ }
+ else {
+ /*
+ Treat as a projection field with the additional ability to
+ add aggregation operators.
+ */
+                uassert(15950, str::stream() <<
+                        "the group aggregate field name \"" <<
+                        pFieldName << "\" cannot be an operator name",
+                        *pFieldName != '$');
+
+                uassert(15951, str::stream() <<
+                        "the group aggregate field \"" << pFieldName <<
+                        "\" must be defined as an expression inside an object",
+                        groupField.type() == Object);
+
+ BSONObj subField(groupField.Obj());
+ BSONObjIterator subIterator(subField);
+ size_t subCount = 0;
+ for(; subIterator.more(); ++subCount) {
+ BSONElement subElement(subIterator.next());
+
+ /* look for the specified operator */
+ GroupOpDesc key;
+ key.pName = subElement.fieldName();
+ const GroupOpDesc *pOp =
+ (const GroupOpDesc *)bsearch(
+ &key, GroupOpTable, NGroupOp, sizeof(GroupOpDesc),
+ GroupOpDescCmp);
+
+ uassert(15952, str::stream() <<
+ "unknown group operator \"" <<
+ key.pName << "\"",
+ pOp);
+
+ intrusive_ptr<Expression> pGroupExpr;
+
+ BSONType elementType = subElement.type();
+ if (elementType == Object) {
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ pGroupExpr = Expression::parseObject(
+ &subElement, &oCtx);
+ }
+ else if (elementType == Array) {
+ uassert(15953, str::stream() <<
+ "aggregating group operators are unary (" <<
+ key.pName << ")", false);
+ }
+                else { /* assume it's an atomic single operand */
+ pGroupExpr = Expression::parseOperand(&subElement);
+ }
+
+ pGroup->addAccumulator(
+ pFieldName, pOp->pFactory, pGroupExpr);
+ }
+
+ uassert(15954, str::stream() <<
+ "the computed aggregate \"" <<
+ pFieldName << "\" must specify exactly one operator",
+ subCount == 1);
+ }
+ }
+
+ uassert(15955, "a group specification must include an _id", idSet);
+
+ return pGroup;
+ }
+
+ void DocumentSourceGroup::populate() {
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* get the _id document */
+ intrusive_ptr<const Value> pId(pIdExpression->evaluate(pDocument));
+ uassert(15956, "the _id field for a group must not be undefined",
+ pId->getType() != Undefined);
+
+ /*
+ Look for the _id value in the map; if it's not there, add a
+ new entry with a blank accumulator.
+ */
+ vector<intrusive_ptr<Accumulator> > *pGroup;
+ GroupsType::iterator it(groups.find(pId));
+ if (it != groups.end()) {
+ /* point at the existing accumulators */
+ pGroup = &it->second;
+ }
+ else {
+ /* insert a new group into the map */
+ groups.insert(it,
+ pair<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> > >(
+ pId, vector<intrusive_ptr<Accumulator> >()));
+
+ /* find the accumulator vector (the map value) */
+ it = groups.find(pId);
+ pGroup = &it->second;
+
+ /* add the accumulators */
+ const size_t n = vpAccumulatorFactory.size();
+ pGroup->reserve(n);
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pAccumulator(
+ (*vpAccumulatorFactory[i])(pCtx));
+ pAccumulator->addOperand(vpExpression[i]);
+ pGroup->push_back(pAccumulator);
+ }
+ }
+
+ /* point at the existing key */
+ // unneeded atm // pId = it.first;
+
+ /* tickle all the accumulators for the group we found */
+ const size_t n = pGroup->size();
+ for(size_t i = 0; i < n; ++i)
+ (*pGroup)[i]->evaluate(pDocument);
+ }
+
+ /* start the group iterator */
+ groupsIterator = groups.begin();
+ if (groupsIterator != groups.end())
+ pCurrent = makeDocument(groupsIterator);
+ populated = true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceGroup::makeDocument(
+ const GroupsType::iterator &rIter) {
+ vector<intrusive_ptr<Accumulator> > *pGroup = &rIter->second;
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pResult(Document::create(1 + n));
+
+ /* add the _id field */
+ pResult->addField(Document::idName, rIter->first);
+
+ /* add the rest of the fields */
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue((*pGroup)[i]->getValue());
+ if (pValue->getType() != Undefined)
+ pResult->addField(vFieldName[i], pValue);
+ }
+
+ return pResult;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createMerger() {
+ intrusive_ptr<DocumentSourceGroup> pMerger(
+ DocumentSourceGroup::create(pCtx));
+
+ /* the merger will use the same grouping key */
+ pMerger->setIdExpression(ExpressionFieldPath::create(
+ Document::idName.c_str()));
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ /*
+ The merger's output field names will be the same, as will the
+ accumulator factories. However, for some accumulators, the
+ expression to be accumulated will be different. The original
+ accumulator may be collecting an expression based on a field
+ expression or constant. Here, we accumulate the output of the
+ same name from the prior group.
+ */
+ pMerger->addAccumulator(
+ vFieldName[i], vpAccumulatorFactory[i],
+ ExpressionFieldPath::create(vFieldName[i]));
+ }
+
+ return pMerger;
+ }
+}
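
GroupOpTable above (like OpTable in expression.cpp further down) depends
on its entries staying alphabetically sorted so that a plain bsearch()
can resolve operator names. A self-contained sketch of that lookup over
a hypothetical table:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    struct OpDesc { const char *pName; int id; };

    static int OpDescCmp(const void *pL, const void *pR) {
        return std::strcmp(static_cast<const OpDesc *>(pL)->pName,
                           static_cast<const OpDesc *>(pR)->pName);
    }

    // must stay sorted by pName for bsearch() to work
    static const OpDesc table[] = {
        {"$avg", 1}, {"$max", 2}, {"$min", 3}, {"$sum", 4},
    };

    int main() {
        OpDesc key = {"$min", 0};
        const OpDesc *pOp = static_cast<const OpDesc *>(
            std::bsearch(&key, table, sizeof(table)/sizeof(table[0]),
                         sizeof(OpDesc), OpDescCmp));
        if (pOp)
            std::printf("%s -> factory id %d\n", pOp->pName, pOp->id);
        return 0;
    }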
+
+
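
populate() above makes a single streaming pass: it evaluates the _id key
for each input document, finds or creates that group's accumulator
vector, and "tickles" every accumulator with the document. The same
shape with a std::map and a running $sum, as a hypothetical sketch:

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>

    int main() {
        // (key, value) rows standing in for documents and their _id
        std::pair<std::string, int> rows[] = {
            std::make_pair("a", 1), std::make_pair("b", 2),
            std::make_pair("a", 3),
        };
        std::map<std::string, long long> sums; // one accumulator per group
        for (size_t i = 0; i < sizeof(rows)/sizeof(rows[0]); ++i)
            sums[rows[i].first] += rows[i].second; // tickle the accumulator
        // walk the finished groups, as advance()/getCurrent() do
        for (std::map<std::string, long long>::const_iterator it =
                 sums.begin(); it != sums.end(); ++it)
            std::cout << it->first << " -> " << it->second << "\n";
        return 0;
    }
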
diff --git a/src/mongo/db/pipeline/document_source_limit.cpp b/src/mongo/db/pipeline/document_source_limit.cpp
new file mode 100644
index 00000000000..a73d4da2005
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_limit.cpp
@@ -0,0 +1,83 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceLimit::limitName[] = "$limit";
+
+ DocumentSourceLimit::DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ limit(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceLimit::~DocumentSourceLimit() {
+ }
+
+ bool DocumentSourceLimit::eof() {
+ return pSource->eof() || count >= limit;
+ }
+
+ bool DocumentSourceLimit::advance() {
+ ++count;
+ if (count >= limit) {
+ pCurrent.reset();
+ return false;
+ }
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceLimit::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ void DocumentSourceLimit::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$limit", limit);
+ }
+
+ intrusive_ptr<DocumentSourceLimit> DocumentSourceLimit::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceLimit> pSource(
+ new DocumentSourceLimit(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceLimit::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15957, "the limit must be specified as a number",
+ pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceLimit> pLimit(
+ DocumentSourceLimit::create(pCtx));
+
+ pLimit->limit = (int)pBsonElement->numberLong();
+ uassert(15958, "the limit must be positive",
+ pLimit->limit > 0);
+
+ return pLimit;
+ }
+}
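
The $limit stage is a counting pass-through: documents flow by until
count reaches limit, after which eof() reports true regardless of what
the source still holds. The semantics in a standalone sketch, assuming a
plain array as the source:

    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> source;
        for (int i = 1; i <= 5; ++i) source.push_back(i * 10);

        const int limit = 3;
        int count = 0;
        // forward at most `limit` documents, as eof()/advance() do above
        for (size_t i = 0; i < source.size() && count < limit; ++i, ++count)
            std::cout << source[i] << "\n"; // prints 10, 20, 30
        return 0;
    }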
diff --git a/src/mongo/db/pipeline/document_source_match.cpp b/src/mongo/db/pipeline/document_source_match.cpp
new file mode 100755
index 00000000000..bedac3ef717
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_match.cpp
@@ -0,0 +1,80 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/matcher.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+
+namespace mongo {
+
+ const char DocumentSourceMatch::matchName[] = "$match";
+
+ DocumentSourceMatch::~DocumentSourceMatch() {
+ }
+
+ void DocumentSourceMatch::sourceToBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->append(matchName, *pQuery);
+ }
+
+ bool DocumentSourceMatch::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ /*
+ The matcher only takes BSON documents, so we have to make one.
+
+ LATER
+          We could optimize this by making a document with only the
+          fields referenced by the Matcher. We could do this by looking
+          inside the Matcher's BSON query and recording the fields it
+          references. The easiest implementation might be to hold onto an
+          ExpressionDocument in here, use it to create the subset of
+          pDocument's fields, and then convert that instead.
+ */
+ BSONObjBuilder objBuilder;
+ pDocument->toBson(&objBuilder);
+ BSONObj obj(objBuilder.done());
+
+ return matcher.matches(obj);
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceMatch::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15959, "the match filter must be an expression in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceMatch> pMatcher(
+ new DocumentSourceMatch(pBsonElement->Obj()));
+
+ return pMatcher;
+ }
+
+ void DocumentSourceMatch::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->appendElements(*pQuery);
+ }
+
+ DocumentSourceMatch::DocumentSourceMatch(const BSONObj &query):
+ DocumentSourceFilterBase(),
+ matcher(query) {
+ }
+}
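
accept() above has to round-trip each pipeline document back into BSON
because the Matcher only understands BSON; the LATER note sketches how
to shrink that cost. The essence of the round-trip-then-match step, in a
hypothetical sketch where the "BSON" is a map and the "matcher" checks
simple equality:

    #include <iostream>
    #include <map>
    #include <string>

    typedef std::map<std::string, int> Bson; // stand-in for a BSON object

    // a degenerate "matcher": every query field must match exactly
    static bool matches(const Bson &query, const Bson &doc) {
        for (Bson::const_iterator it = query.begin();
             it != query.end(); ++it) {
            Bson::const_iterator f = doc.find(it->first);
            if (f == doc.end() || f->second != it->second)
                return false;
        }
        return true;
    }

    int main() {
        Bson query; query["a"] = 7;
        Bson doc;   doc["a"] = 7; doc["b"] = 1; // the converted document
        std::cout << (matches(query, doc) ? "accept" : "reject") << "\n";
        return 0;
    }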
diff --git a/src/mongo/db/pipeline/document_source_out.cpp b/src/mongo/db/pipeline/document_source_out.cpp
new file mode 100755
index 00000000000..5a30342d25c
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_out.cpp
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+
+namespace mongo {
+
+ const char DocumentSourceOut::outName[] = "$out";
+
+ DocumentSourceOut::~DocumentSourceOut() {
+ }
+
+ bool DocumentSourceOut::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceOut::advance() {
+ return pSource->advance();
+ }
+
+ boost::intrusive_ptr<Document> DocumentSourceOut::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ DocumentSourceOut::DocumentSourceOut(BSONElement *pBsonElement) {
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<DocumentSourceOut> DocumentSourceOut::createFromBson(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<DocumentSourceOut> pSource(
+ new DocumentSourceOut(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceOut::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // CW TODO
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_project.cpp b/src/mongo/db/pipeline/document_source_project.cpp
new file mode 100755
index 00000000000..bb7a0b5a6d9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_project.cpp
@@ -0,0 +1,201 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceProject::projectName[] = "$project";
+
+ DocumentSourceProject::~DocumentSourceProject() {
+ }
+
+ DocumentSourceProject::DocumentSourceProject():
+ excludeId(false),
+ pEO(ExpressionObject::create()) {
+ }
+
+ bool DocumentSourceProject::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceProject::advance() {
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceProject::getCurrent() {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ const size_t sizeHint =
+ pEO->getSizeHint(pInDocument) + (excludeId ? 0 : 1);
+ intrusive_ptr<Document> pResultDocument(Document::create(sizeHint));
+
+ if (!excludeId) {
+ intrusive_ptr<const Value> pId(
+ pInDocument->getField(Document::idName));
+ pResultDocument->addField(Document::idName, pId);
+ }
+
+ /* use the ExpressionObject to create the base result */
+ pEO->addToDocument(pResultDocument, pInDocument);
+
+ return pResultDocument;
+ }
+
+ void DocumentSourceProject::optimize() {
+ intrusive_ptr<Expression> pE(pEO->optimize());
+ pEO = dynamic_pointer_cast<ExpressionObject>(pE);
+ }
+
+ void DocumentSourceProject::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ if (excludeId)
+ insides.append(Document::idName, false);
+ pEO->documentToBson(&insides, 0);
+ pBuilder->append(projectName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceProject> DocumentSourceProject::create() {
+ intrusive_ptr<DocumentSourceProject> pSource(
+ new DocumentSourceProject());
+ return pSource;
+ }
+
+ void DocumentSourceProject::addField(
+ const string &fieldName, const intrusive_ptr<Expression> &pExpression) {
+ uassert(15960,
+ "projection fields must be defined by non-empty expressions",
+ pExpression);
+
+ pEO->addField(fieldName, pExpression);
+ }
+
+ void DocumentSourceProject::includePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ uassert(15961, str::stream() << projectName <<
+ ": _id cannot be included once it has been excluded",
+ !excludeId);
+
+ return;
+ }
+
+ pEO->includePath(fieldPath);
+ }
+
+ void DocumentSourceProject::excludePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ excludeId = true;
+ return;
+ }
+
+ pEO->excludePath(fieldPath);
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceProject::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /* validate */
+ uassert(15969, str::stream() << projectName <<
+ " specification must be an object",
+ pBsonElement->type() == Object);
+
+ /* chain the projection onto the original source */
+ intrusive_ptr<DocumentSourceProject> pProject(
+ DocumentSourceProject::create());
+
+ /*
+ Pull out the $project object. This should just be a list of
+ field inclusion or exclusion specifications. Note you can't do
+ both, except for the case of _id.
+ */
+ BSONObj projectObj(pBsonElement->Obj());
+ BSONObjIterator fieldIterator(projectObj);
+ Expression::ObjectCtx objectCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ while(fieldIterator.more()) {
+ BSONElement outFieldElement(fieldIterator.next());
+ string outFieldPath(outFieldElement.fieldName());
+ string inFieldName(outFieldPath);
+ BSONType specType = outFieldElement.type();
+ int fieldInclusion = -1;
+
+ switch(specType) {
+ case NumberDouble: {
+ double inclusion = outFieldElement.numberDouble();
+ fieldInclusion = static_cast<int>(inclusion);
+ goto IncludeExclude;
+ }
+
+ case NumberInt:
+ /* just a plain integer include/exclude specification */
+ fieldInclusion = outFieldElement.numberInt();
+
+IncludeExclude:
+ uassert(15970, str::stream() <<
+ "field inclusion or exclusion specification for \"" <<
+ outFieldPath <<
+ "\" must be true, 1, false, or zero",
+ ((fieldInclusion == 0) || (fieldInclusion == 1)));
+
+ if (fieldInclusion == 0)
+ pProject->excludePath(outFieldPath);
+ else
+ pProject->includePath(outFieldPath);
+ break;
+
+ case Bool:
+ /* just a plain boolean include/exclude specification */
+ fieldInclusion = (outFieldElement.Bool() ? 1 : 0);
+ goto IncludeExclude;
+
+ case String:
+ /* include a field, with rename */
+ fieldInclusion = 1;
+ inFieldName = outFieldElement.String();
+ pProject->addField(
+ outFieldPath,
+ ExpressionFieldPath::create(
+ Expression::removeFieldPrefix(inFieldName)));
+ break;
+
+ case Object: {
+ intrusive_ptr<Expression> pDocument(
+ Expression::parseObject(&outFieldElement, &objectCtx));
+
+            /* add the document expression to the projection */
+ pProject->addField(outFieldPath, pDocument);
+ break;
+ }
+
+ default:
+ uassert(15971, str::stream() <<
+ "invalid BSON type (" << specType <<
+ ") for " << projectName <<
+ " field " << outFieldPath, false);
+ }
+
+ }
+
+ return pProject;
+ }
+}
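
createFromBson() above funnels the NumberDouble, NumberInt, and Bool
spec types into a single include/exclude check via the IncludeExclude
label. A goto-free sketch of the same normalization, under the
assumption that every numeric or boolean spec can be widened to double
first (hypothetical helper, not mongo's API):

    #include <iostream>
    #include <stdexcept>

    // normalize a projection spec (1/0, true/false, 1.0/0.0) to a flag
    static bool includeField(double spec) {
        const int inclusion = static_cast<int>(spec);
        if (inclusion != 0 && inclusion != 1)
            throw std::runtime_error(
                "field inclusion or exclusion must be true, 1, false, or 0");
        return inclusion == 1;
    }

    int main() {
        std::cout << includeField(1.0) << " "    // 1: include
                  << includeField(false) << "\n"; // 0: exclude
        return 0;
    }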
diff --git a/src/mongo/db/pipeline/document_source_skip.cpp b/src/mongo/db/pipeline/document_source_skip.cpp
new file mode 100644
index 00000000000..74bf2360ce9
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_skip.cpp
@@ -0,0 +1,99 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceSkip::skipName[] = "$skip";
+
+ DocumentSourceSkip::DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ skip(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceSkip::~DocumentSourceSkip() {
+ }
+
+ void DocumentSourceSkip::skipper() {
+ if (count == 0) {
+ while (!pSource->eof() && count++ < skip) {
+ pSource->advance();
+ }
+ }
+
+ if (pSource->eof()) {
+ pCurrent.reset();
+ return;
+ }
+
+ pCurrent = pSource->getCurrent();
+ }
+
+ bool DocumentSourceSkip::eof() {
+ skipper();
+ return pSource->eof();
+ }
+
+ bool DocumentSourceSkip::advance() {
+ if (eof()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceSkip::getCurrent() {
+ skipper();
+ return pCurrent;
+ }
+
+ void DocumentSourceSkip::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$skip", skip);
+ }
+
+ intrusive_ptr<DocumentSourceSkip> DocumentSourceSkip::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSkip> pSource(
+ new DocumentSourceSkip(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSkip::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15972, str::stream() << "the value to " <<
+ skipName << " must be a number", pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceSkip> pSkip(
+ DocumentSourceSkip::create(pCtx));
+
+ pSkip->skip = (int)pBsonElement->numberLong();
+ assert(pSkip->skip > 0); // CW TODO error code
+
+ return pSkip;
+ }
+}
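
skipper() above discards the first `skip` documents exactly once (the
count == 0 guard), after which the stage is a pure pass-through. The
counting idiom on its own, as a standalone sketch:

    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> source;
        for (int i = 1; i <= 5; ++i) source.push_back(i);

        const int skip = 2;
        int count = 0;
        size_t pos = 0;
        // discard documents while count++ < skip, as skipper() does
        while (pos < source.size() && count++ < skip)
            ++pos;
        for (; pos < source.size(); ++pos)
            std::cout << source[pos] << "\n"; // prints 3, 4, 5
        return 0;
    }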
diff --git a/src/mongo/db/pipeline/document_source_sort.cpp b/src/mongo/db/pipeline/document_source_sort.cpp
new file mode 100755
index 00000000000..bf4739af7d1
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_sort.cpp
@@ -0,0 +1,216 @@
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+
+namespace mongo {
+ const char DocumentSourceSort::sortName[] = "$sort";
+
+ DocumentSourceSort::~DocumentSourceSort() {
+ }
+
+ bool DocumentSourceSort::eof() {
+ if (!populated)
+ populate();
+
+ return (listIterator == documents.end());
+ }
+
+ bool DocumentSourceSort::advance() {
+ if (!populated)
+ populate();
+
+ assert(listIterator != documents.end());
+
+ ++listIterator;
+ if (listIterator == documents.end()) {
+ pCurrent.reset();
+ count = 0;
+ return false;
+ }
+ pCurrent = listIterator->pDocument;
+
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceSort::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceSort::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sortKeyToBson(&insides, false);
+ pBuilder->append(sortName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceSort> DocumentSourceSort::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSort> pSource(
+ new DocumentSourceSort(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceSort::DocumentSourceSort(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceSort::addKey(const string &fieldPath, bool ascending) {
+ intrusive_ptr<ExpressionFieldPath> pE(
+ ExpressionFieldPath::create(fieldPath));
+ vSortKey.push_back(pE);
+ vAscending.push_back(ascending);
+ }
+
+ void DocumentSourceSort::sortKeyToBson(
+ BSONObjBuilder *pBuilder, bool usePrefix) const {
+ /* add the key fields */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* create the "field name" */
+ stringstream ss;
+ vSortKey[i]->writeFieldPath(ss, usePrefix);
+
+ /* append a named integer based on the sort order */
+ pBuilder->append(ss.str(), (vAscending[i] ? 1 : -1));
+ }
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSort::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+        uassert(15973, str::stream() << "the " <<
+ sortName << " key specification must be an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceSort> pSort(
+ DocumentSourceSort::create(pCtx));
+
+        /* check for, then iterate over, the sort object */
+ size_t sortKeys = 0;
+ for(BSONObjIterator keyIterator(pBsonElement->Obj().begin());
+ keyIterator.more();) {
+ BSONElement keyField(keyIterator.next());
+ const char *pKeyFieldName = keyField.fieldName();
+ int sortOrder = 0;
+
+ uassert(15974, str::stream() << sortName <<
+ " key ordering must be specified using a number",
+ keyField.isNumber());
+ sortOrder = (int)keyField.numberInt();
+
+ uassert(15975, str::stream() << sortName <<
+ " key ordering must be 1 (for ascending) or -1 (for descending",
+ ((sortOrder == 1) || (sortOrder == -1)));
+
+ pSort->addKey(pKeyFieldName, (sortOrder > 0));
+ ++sortKeys;
+ }
+
+ uassert(15976, str::stream() << sortName <<
+ " must have at least one sort key", (sortKeys > 0));
+
+ return pSort;
+ }
+
+ void DocumentSourceSort::populate() {
+ /* make sure we've got a sort key */
+ assert(vSortKey.size());
+
+ /* track and warn about how much physical memory has been used */
+ DocMemMonitor dmm(this);
+
+ /* pull everything from the underlying source */
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ documents.push_back(Carrier(this, pDocument));
+
+ dmm.addToTotal(pDocument->getApproximateSize());
+ }
+
+ /* sort the list */
+ documents.sort(Carrier::lessThan);
+
+ /* start the sort iterator */
+ listIterator = documents.begin();
+
+ if (listIterator != documents.end())
+ pCurrent = listIterator->pDocument;
+ populated = true;
+ }
+
+ int DocumentSourceSort::compare(
+ const intrusive_ptr<Document> &pL, const intrusive_ptr<Document> &pR) {
+
+ /*
+ populate() already checked that there is a non-empty sort key,
+ so we shouldn't have to worry about that here.
+
+          However, the tricky part is what to do if none of the sort keys
+          are present. In this case, consider the document less.
+ */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* evaluate the sort keys */
+ ExpressionFieldPath *pE = vSortKey[i].get();
+ intrusive_ptr<const Value> pLeft(pE->evaluate(pL));
+ intrusive_ptr<const Value> pRight(pE->evaluate(pR));
+
+ /*
+ Compare the two values; if they differ, return. If they are
+ the same, move on to the next key.
+ */
+ int cmp = Value::compare(pLeft, pRight);
+ if (cmp) {
+ /* if necessary, adjust the return value by the key ordering */
+ if (!vAscending[i])
+ cmp = -cmp;
+
+ return cmp;
+ }
+ }
+
+ /*
+ If we got here, everything matched (or didn't exist), so we'll
+ consider the documents equal for purposes of this sort.
+ */
+ return 0;
+ }
+
+ bool DocumentSourceSort::Carrier::lessThan(
+ const Carrier &rL, const Carrier &rR) {
+ /* make sure these aren't from different lists */
+ assert(rL.pSort == rR.pSort);
+
+ /* compare the documents according to the sort key */
+ return (rL.pSort->compare(rL.pDocument, rR.pDocument) < 0);
+ }
+}
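
compare() above is a lexicographic comparator: walk the sort keys in
order, negate the comparison for descending keys, and let the first
differing key decide. The same structure over pairs, as a hedged sketch
(two fixed keys instead of a vSortKey vector):

    #include <algorithm>
    #include <iostream>
    #include <utility>
    #include <vector>

    typedef std::pair<int, int> Row; // two "sort key" fields per document

    // first key ascending, second key descending; first difference wins
    static bool lessThan(const Row &l, const Row &r) {
        if (l.first != r.first)
            return l.first < r.first;
        return l.second > r.second; // negated order for the descending key
    }

    int main() {
        std::vector<Row> rows;
        rows.push_back(std::make_pair(1, 2));
        rows.push_back(std::make_pair(1, 9));
        rows.push_back(std::make_pair(0, 5));
        std::sort(rows.begin(), rows.end(), lessThan);
        for (size_t i = 0; i < rows.size(); ++i)
            std::cout << rows[i].first << "," << rows[i].second << "\n";
        return 0; // prints 0,5  1,9  1,2
    }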
diff --git a/src/mongo/db/pipeline/document_source_unwind.cpp b/src/mongo/db/pipeline/document_source_unwind.cpp
new file mode 100755
index 00000000000..bb231451113
--- /dev/null
+++ b/src/mongo/db/pipeline/document_source_unwind.cpp
@@ -0,0 +1,234 @@
+/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceUnwind::unwindName[] = "$unwind";
+
+ DocumentSourceUnwind::~DocumentSourceUnwind() {
+ }
+
+ DocumentSourceUnwind::DocumentSourceUnwind():
+ unwindPath(),
+ pNoUnwindDocument(),
+ pUnwindArray(),
+ pUnwinder(),
+ pUnwindValue() {
+ }
+
+ bool DocumentSourceUnwind::eof() {
+ /*
+ If we're unwinding an array, and there are more elements, then we
+ can return more documents.
+ */
+ if (pUnwinder.get() && pUnwinder->more())
+ return false;
+
+ return pSource->eof();
+ }
+
+ bool DocumentSourceUnwind::advance() {
+ if (pUnwinder.get() && pUnwinder->more()) {
+ pUnwindValue = pUnwinder->next();
+ return true;
+ }
+
+ /* release the last document and advance */
+ resetArray();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::getCurrent() {
+ if (!pNoUnwindDocument.get()) {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ pNoUnwindDocument = pInDocument;
+ fieldIndex.clear();
+
+ /*
+ First we'll look to see if the path is there. If it isn't,
+ we'll pass this document through. If it is, we record the
+ indexes of the fields down the field path so that we can
+ quickly replace them as we clone the documents along the
+ field path.
+
+ We have to clone all the documents along the field path so
+ that we don't share the end value across documents that have
+ come out of this pipeline operator.
+ */
+ intrusive_ptr<Document> pCurrent(pInDocument);
+ const size_t pathLength = unwindPath.getPathLength();
+ for(size_t i = 0; i < pathLength; ++i) {
+ size_t idx = pCurrent->getFieldIndex(
+ unwindPath.getFieldName(i));
+ if (idx == pCurrent->getFieldCount() ) {
+ /* this document doesn't contain the target field */
+ resetArray();
+ return pInDocument;
+ break;
+ }
+
+ fieldIndex.push_back(idx);
+ Document::FieldPair fp(pCurrent->getField(idx));
+ intrusive_ptr<const Value> pPathValue(fp.second);
+ if (i < pathLength - 1) {
+ if (pPathValue->getType() != Object) {
+ /* can't walk down the field path */
+ resetArray();
+ uassert(15977, str::stream() << unwindName <<
+ ": cannot traverse field path past scalar value for \"" <<
+ fp.first << "\"", false);
+ break;
+ }
+
+ /* move down the object tree */
+ pCurrent = pPathValue->getDocument();
+ }
+ else /* (i == pathLength - 1) */ {
+ if (pPathValue->getType() != Array) {
+ /* last item on path must be an array to unwind */
+ resetArray();
+ uassert(15978, str::stream() << unwindName <<
+ ": value at end of field path must be an array",
+ false);
+ break;
+ }
+
+ /* keep track of the array we're unwinding */
+ pUnwindArray = pPathValue;
+ if (pUnwindArray->getArrayLength() == 0) {
+ /*
+ The $unwind of an empty array is a NULL value. If we
+ encounter this, use the non-unwind path, but replace
+                          the unwound array field with a null.
+
+ Make sure unwind value is clear so the array is
+ removed.
+ */
+ pUnwindValue.reset();
+ intrusive_ptr<Document> pClone(clonePath());
+ resetArray();
+ return pClone;
+ }
+
+ /* get the iterator we'll use to unwind the array */
+ pUnwinder = pUnwindArray->getArray();
+ assert(pUnwinder->more()); // we just checked above...
+ pUnwindValue = pUnwinder->next();
+ }
+ }
+ }
+
+ /*
+ If we're unwinding a field, create an alternate document. In the
+ alternate (clone), replace the unwound array field with the element
+ at the appropriate index.
+ */
+ if (pUnwindArray.get()) {
+ /* clone the document with an array we're unwinding */
+ intrusive_ptr<Document> pUnwindDocument(clonePath());
+
+ return pUnwindDocument;
+ }
+
+ return pNoUnwindDocument;
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::clonePath() const {
+ /*
+ For this to be valid, we must already have pNoUnwindDocument set,
+ and have set up the vector of indices for that document in fieldIndex.
+ */
+ assert(pNoUnwindDocument.get());
+ assert(pUnwinder.get());
+
+ intrusive_ptr<Document> pClone(pNoUnwindDocument->clone());
+ intrusive_ptr<Document> pCurrent(pClone);
+ const size_t n = fieldIndex.size();
+ assert(n);
+ for(size_t i = 0; i < n; ++i) {
+ const size_t fi = fieldIndex[i];
+ Document::FieldPair fp(pCurrent->getField(fi));
+ if (i + 1 < n) {
+ /*
+ For every object in the path but the last, clone it and
+ continue on down.
+ */
+ intrusive_ptr<Document> pNext(
+ fp.second->getDocument()->clone());
+ pCurrent->setField(fi, fp.first, Value::createDocument(pNext));
+ pCurrent = pNext;
+ }
+ else {
+                /* for the last, substitute the next unwound value */
+ pCurrent->setField(fi, fp.first, pUnwindValue);
+ }
+ }
+
+ return pClone;
+ }
+
+ void DocumentSourceUnwind::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append(unwindName, unwindPath.getPath(true));
+ }
+
+ intrusive_ptr<DocumentSourceUnwind> DocumentSourceUnwind::create() {
+ intrusive_ptr<DocumentSourceUnwind> pSource(
+ new DocumentSourceUnwind());
+ return pSource;
+ }
+
+ void DocumentSourceUnwind::unwindField(const FieldPath &rFieldPath) {
+ /* can't set more than one unwind field */
+ uassert(15979, str::stream() << unwindName <<
+ "can't unwind more than one path at once",
+ !unwindPath.getPathLength());
+
+ uassert(15980, "the path of the field to unwind cannot be empty",
+                rFieldPath.getPathLength());
+
+ /* record the field path */
+ unwindPath = rFieldPath;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceUnwind::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /*
+ The value of $unwind should just be a field path.
+ */
+ uassert(15981, str::stream() << "the " << unwindName <<
+ " field path must be specified as a string",
+ pBsonElement->type() == String);
+
+ string prefixedPathString(pBsonElement->String());
+ string pathString(Expression::removeFieldPrefix(prefixedPathString));
+ intrusive_ptr<DocumentSourceUnwind> pUnwind(
+ DocumentSourceUnwind::create());
+ pUnwind->unwindPath = FieldPath(pathString);
+
+ return pUnwind;
+ }
+}
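
getCurrent() and clonePath() above fan one input document out into one
clone per array element, sharing everything except the documents along
the unwound path. The fan-out itself, in a self-contained sketch with
hypothetical types:

    #include <iostream>
    #include <string>
    #include <vector>

    struct Doc {
        std::string name;
        std::vector<int> tags; // the array field being unwound
    };

    int main() {
        Doc in;
        in.name = "widget";
        in.tags.push_back(3);
        in.tags.push_back(7);

        // one output per element: clone the document, then substitute
        // the element for the array, as clonePath() does with pUnwindValue
        for (size_t i = 0; i < in.tags.size(); ++i) {
            Doc out = in;                   // clone the whole document
            out.tags.assign(1, in.tags[i]); // substitute the unwound value
            std::cout << out.name << " tag=" << out.tags[0] << "\n";
        }
        return 0;
    }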
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
new file mode 100755
index 00000000000..b3caefcf899
--- /dev/null
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -0,0 +1,2815 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/expression.h"
+
+#include <cstdio>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ /* --------------------------- Expression ------------------------------ */
+
+ void Expression::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(false && "Expression::toMatcherBson()");
+ }
+
+ Expression::ObjectCtx::ObjectCtx(int theOptions):
+ options(theOptions),
+ unwindField() {
+ }
+
+ void Expression::ObjectCtx::unwind(string fieldName) {
+ assert(unwindOk());
+ assert(!unwindUsed());
+ assert(fieldName.size());
+ unwindField = fieldName;
+ }
+
+ bool Expression::ObjectCtx::documentOk() const {
+ return ((options & DOCUMENT_OK) != 0);
+ }
+
+ const char Expression::unwindName[] = "$unwind";
+
+ string Expression::removeFieldPrefix(const string &prefixedField) {
+ const char *pPrefixedField = prefixedField.c_str();
+ uassert(15982, str::stream() <<
+ "field path references must be prefixed with a '$' (\"" <<
+            prefixedField << "\")", pPrefixedField[0] == '$');
+
+ return string(pPrefixedField + 1);
+ }
+
+ intrusive_ptr<Expression> Expression::parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx) {
+ /*
+ An object expression can take any of the following forms:
+
+ f0: {f1: ..., f2: ..., f3: ...}
+ f0: {$operator:[operand1, operand2, ...]}
+ f0: {$unwind:"fieldpath"}
+
+ We handle $unwind as a special case, because this is done by the
+ projection source. For any other expression, we hand over control to
+ code that parses the expression and returns an expression.
+ */
+
+ intrusive_ptr<Expression> pExpression; // the result
+ intrusive_ptr<ExpressionObject> pExpressionObject; // alt result
+ int isOp = -1; /* -1 -> unknown, 0 -> not an operator, 1 -> operator */
+ enum { UNKNOWN, NOTOPERATOR, OPERATOR } kind = UNKNOWN;
+
+ BSONObj obj(pBsonElement->Obj());
+ BSONObjIterator iter(obj);
+ for(size_t fieldCount = 0; iter.more(); ++fieldCount) {
+ BSONElement fieldElement(iter.next());
+ const char *pFieldName = fieldElement.fieldName();
+
+ if (pFieldName[0] == '$') {
+ uassert(15983, str::stream() <<
+ "the operator must be the only field in a pipeline object (at \""
+                        << pFieldName << "\")",
+ fieldCount == 0);
+
+ /* we've determined this "object" is an operator expression */
+ isOp = 1;
+ kind = OPERATOR;
+
+ if (strcmp(pFieldName, unwindName) != 0) {
+ pExpression = parseExpression(pFieldName, &fieldElement);
+ }
+ else {
+ assert(pCtx->unwindOk());
+ // CW TODO error: it's not OK to unwind in this context
+
+ assert(!pCtx->unwindUsed());
+ // CW TODO error: this projection already has an unwind
+
+ assert(fieldElement.type() == String);
+ // CW TODO $unwind operand must be single field name
+
+ string fieldPath(removeFieldPrefix(fieldElement.String()));
+ pExpression = ExpressionFieldPath::create(fieldPath);
+ pCtx->unwind(fieldPath);
+ }
+ }
+ else {
+ uassert(15984, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ isOp != 1);
+ uassert(15990, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ kind != OPERATOR);
+
+ /* if it's our first time, create the document expression */
+ if (!pExpression.get()) {
+ assert(pCtx->documentOk());
+ // CW TODO error: document not allowed in this context
+
+ pExpressionObject = ExpressionObject::create();
+ pExpression = pExpressionObject;
+
+ /* this "object" is not an operator expression */
+ isOp = 0;
+ kind = NOTOPERATOR;
+ }
+
+ BSONType fieldType = fieldElement.type();
+ string fieldName(pFieldName);
+ if (fieldType == Object) {
+ /* it's a nested document */
+ ObjectCtx oCtx(
+ (pCtx->documentOk() ? ObjectCtx::DOCUMENT_OK : 0));
+ intrusive_ptr<Expression> pNested(
+ parseObject(&fieldElement, &oCtx));
+ pExpressionObject->addField(fieldName, pNested);
+ }
+ else if (fieldType == String) {
+ /* it's a renamed field */
+ // CW TODO could also be a constant
+ intrusive_ptr<Expression> pPath(
+ ExpressionFieldPath::create(
+ removeFieldPrefix(fieldElement.String())));
+ pExpressionObject->addField(fieldName, pPath);
+ }
+ else if (fieldType == NumberDouble) {
+ /* it's an inclusion specification */
+ int inclusion = static_cast<int>(fieldElement.Double());
+ if (inclusion == 0)
+ pExpressionObject->excludePath(fieldName);
+ else if (inclusion == 1)
+ pExpressionObject->includePath(fieldName);
+ else
+ uassert(15991, str::stream() <<
+ "\"" << fieldName <<
+ "\" numeric inclusion or exclusion must be 1 or 0 (or boolean)",
+ false);
+ }
+ else if (fieldType == Bool) {
+ bool inclusion = fieldElement.Bool();
+ if (!inclusion)
+ pExpressionObject->excludePath(fieldName);
+ else
+ pExpressionObject->includePath(fieldName);
+ }
+ else { /* nothing else is allowed */
+ uassert(15992, str::stream() <<
+ "disallowed field type " << fieldType <<
+ " in object expression (at \"" <<
+ fieldName << "\")", false);
+ }
+ }
+ }
+
+ return pExpression;
+ }
+
+
+ struct OpDesc {
+ const char *pName;
+ intrusive_ptr<ExpressionNary> (*pFactory)(void);
+ };
+
+ static int OpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const OpDesc *)pL)->pName, ((const OpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ OpDescCmp() above.
+ */
+ static const OpDesc OpTable[] = {
+ {"$add", ExpressionAdd::create},
+ {"$and", ExpressionAnd::create},
+ {"$cmp", ExpressionCompare::createCmp},
+ {"$cond", ExpressionCond::create},
+ {"$const", ExpressionNoOp::create},
+ {"$dayOfMonth", ExpressionDayOfMonth::create},
+ {"$dayOfWeek", ExpressionDayOfWeek::create},
+ {"$dayOfYear", ExpressionDayOfYear::create},
+ {"$divide", ExpressionDivide::create},
+ {"$eq", ExpressionCompare::createEq},
+ {"$gt", ExpressionCompare::createGt},
+ {"$gte", ExpressionCompare::createGte},
+ {"$hour", ExpressionHour::create},
+ {"$ifNull", ExpressionIfNull::create},
+ {"$lt", ExpressionCompare::createLt},
+ {"$lte", ExpressionCompare::createLte},
+ {"$minute", ExpressionMinute::create},
+ {"$mod", ExpressionMod::create},
+ {"$month", ExpressionMonth::create},
+ {"$multiply", ExpressionMultiply::create},
+ {"$ne", ExpressionCompare::createNe},
+ {"$not", ExpressionNot::create},
+ {"$or", ExpressionOr::create},
+ {"$second", ExpressionSecond::create},
+ {"$strcasecmp", ExpressionStrcasecmp::create},
+ {"$substr", ExpressionSubstr::create},
+ {"$subtract", ExpressionSubtract::create},
+ {"$toLower", ExpressionToLower::create},
+ {"$toUpper", ExpressionToUpper::create},
+ {"$week", ExpressionWeek::create},
+ {"$year", ExpressionYear::create},
+ };
+
+ static const size_t NOp = sizeof(OpTable)/sizeof(OpTable[0]);
+
+ intrusive_ptr<Expression> Expression::parseExpression(
+ const char *pOpName, BSONElement *pBsonElement) {
+ /* look for the specified operator */
+ OpDesc key;
+ key.pName = pOpName;
+ const OpDesc *pOp = (const OpDesc *)bsearch(
+ &key, OpTable, NOp, sizeof(OpDesc), OpDescCmp);
+
+ uassert(15999, str::stream() << "invalid operator \"" <<
+ pOpName << "\"", pOp);
+
+ /* make the expression node */
+ intrusive_ptr<ExpressionNary> pExpression((*pOp->pFactory)());
+
+ /* add the operands to the expression node */
+ BSONType elementType = pBsonElement->type();
+ if (elementType == Object) {
+ /* the operator must be unary and accept an object argument */
+ BSONObj objOperand(pBsonElement->Obj());
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseObject(pBsonElement, &oCtx));
+ pExpression->addOperand(pOperand);
+ }
+ else if (elementType == Array) {
+ /* multiple operands - an n-ary operator */
+ vector<BSONElement> bsonArray(pBsonElement->Array());
+ const size_t n = bsonArray.size();
+ for(size_t i = 0; i < n; ++i) {
+ BSONElement *pBsonOperand = &bsonArray[i];
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonOperand));
+ pExpression->addOperand(pOperand);
+ }
+ }
+ else { /* assume it's an atomic operand */
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonElement));
+ pExpression->addOperand(pOperand);
+ }
+
+ return pExpression;
+ }
+
+ intrusive_ptr<Expression> Expression::parseOperand(BSONElement *pBsonElement) {
+ BSONType type = pBsonElement->type();
+
+ switch(type) {
+ case String: {
+ /*
+ This could be a field path, or it could be a constant
+ string.
+
+ We make a copy of the BSONElement reader so we can read its
+ value without advancing its state, in case we need to read it
+ again in the constant code path.
+ */
+ BSONElement opCopy(*pBsonElement);
+ string value(opCopy.String());
+
+ /* check for a field path */
+ if (value[0] != '$')
+ goto ExpectConstant; // assume plain string constant
+
+ /* if we got here, this is a field path expression */
+ string fieldPath(removeFieldPrefix(value));
+ intrusive_ptr<Expression> pFieldExpr(
+ ExpressionFieldPath::create(fieldPath));
+ return pFieldExpr;
+ }
+
+ case Object: {
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pSubExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ return pSubExpression;
+ }
+
+ default:
+ ExpectConstant: {
+ intrusive_ptr<Expression> pOperand(
+ ExpressionConstant::createFromBsonElement(pBsonElement));
+ return pOperand;
+ }
+
+ } // switch(type)
+
+ /* NOTREACHED */
+ assert(false);
+ return intrusive_ptr<Expression>();
+ }
+
+ /* ------------------------- ExpressionAdd ----------------------------- */
+
+ ExpressionAdd::~ExpressionAdd() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAdd::optimize() {
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+ ExpressionAdd *pA = dynamic_cast<ExpressionAdd *>(pE.get());
+ if (pA) {
+ /* don't create a circular reference */
+ if (pA != this)
+ pA->pAdd = this;
+ }
+
+ return pE;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAdd::create() {
+ intrusive_ptr<ExpressionAdd> pExpression(new ExpressionAdd());
+ return pExpression;
+ }
+
+ ExpressionAdd::ExpressionAdd():
+ ExpressionNary(),
+ useOriginal(false) {
+ }
+
+ intrusive_ptr<const Value> ExpressionAdd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ unsigned stringCount = 0;
+ unsigned nonConstStringCount = 0;
+ unsigned dateCount = 0;
+ const size_t n = vpOperand.size();
+ vector<intrusive_ptr<const Value> > vpValue; /* evaluated operands */
+
+ /* use the original, if we've been told to do so */
+ if (useOriginal) {
+ return pAdd->evaluate(pDocument);
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(
+ vpOperand[i]->evaluate(pDocument));
+ vpValue.push_back(pValue);
+
+ BSONType valueType = pValue->getType();
+ if (valueType == String) {
+ ++stringCount;
+ if (!dynamic_cast<ExpressionConstant *>(vpOperand[i].get()))
+ ++nonConstStringCount;
+ }
+ else if (valueType == Date)
+ ++dateCount;
+ }
+
+ /*
+          We don't allow adding two dates because it doesn't make sense,
+          especially since they are in epoch time. However, if there is a
+          string present, then we would be appending the dates to a string,
+          so having many is not a problem.
+ */
+ if ((dateCount > 1) && !stringCount) {
+ uassert(16000, "can't add two dates together", false);
+ return Value::getNull();
+ }
+
+ /*
+ If there are non-constant strings, and we've got a copy of the
+ original, then use that from this point forward. This is necessary
+ to keep the order of strings the same for string concatenation;
+ constant-folding would violate the order preservation.
+
+ This is a one-way conversion we do if we see one of these. It is
+ possible that these could vary from document to document, but any
+ sane schema probably isn't going to do that, so once we see a string,
+ we can probably assume they're going to be strings all the way down.
+ */
+ if (nonConstStringCount && pAdd.get()) {
+ useOriginal = true;
+ return pAdd->evaluate(pDocument);
+ }
+
+ if (stringCount) {
+ stringstream stringTotal;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ stringTotal << pValue->coerceToString();
+ }
+
+ return Value::createString(stringTotal.str());
+ }
+
+ if (dateCount) {
+ long long dateTotal = 0;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ if (pValue->getType() == Date)
+ dateTotal += pValue->coerceToDate();
+ else
+ dateTotal += static_cast<long long>(pValue->coerceToDouble()*24*60*60*1000);
+ }
+
+ return Value::createDate(Date_t(dateTotal));
+ }
+
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the current narrowest
+ type.
+ */
+ double doubleTotal = 0;
+ long long longTotal = 0;
+ BSONType totalType = NumberInt;
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+
+ totalType = Value::getWidestNumeric(totalType, pValue->getType());
+ doubleTotal += pValue->coerceToDouble();
+ longTotal += pValue->coerceToLong();
+ }
+
+ if (totalType == NumberDouble)
+ return Value::createDouble(doubleTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createInt((int)longTotal);
+ }
+
+ const char *ExpressionAdd::getOpName() const {
+ return "$add";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAdd::getFactory() const)() {
+ return ExpressionAdd::create;
+ }
+
+ void ExpressionAdd::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+
+ if (pAdd)
+ pAdd->toBson(pBuilder, pOpName, depth);
+ else
+ ExpressionNary::toBson(pBuilder, pOpName, depth);
+ }
+
+
+ /* ------------------------- ExpressionAnd ----------------------------- */
+
+ ExpressionAnd::~ExpressionAnd() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAnd::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionAnd());
+ return pExpression;
+ }
+
+ ExpressionAnd::ExpressionAnd():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAnd::optimize() {
+ /* optimize the conjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a conjunction, we can't do anything */
+ ExpressionAnd *pAnd = dynamic_cast<ExpressionAnd *>(pE.get());
+ if (!pAnd)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+          promised by ExpressionNary::optimize()), then there's nothing
+ we can do.
+ */
+ const size_t n = pAnd->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pAnd->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's false,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (!last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getFalse()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was true, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ conjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pAnd->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "true" value, and return the new expression.
+
+ CW TODO:
+ Note that because of any implicit conversions, we may need to
+ apply an implicit boolean conversion.
+ */
+ pAnd->vpOperand.resize(n - 1);
+ return pE;
+ }
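+
+ /*
+ A sketch of the rewrites above (hypothetical operands):
+
+ { $and : [ "$a", "$b", false ] } --> constant false
+ { $and : [ "$a", true ] } --> coerce-to-bool of "$a"
+ { $and : [ "$a", "$b", true ] } --> { $and : [ "$a", "$b" ] }
+
+ These three cases are exhaustive because ExpressionNary::optimize()
+ has already folded all constant operands into a single trailing
+ constant.
+ */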
+
+ intrusive_ptr<const Value> ExpressionAnd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (!pValue->coerceToBool())
+ return Value::getFalse();
+ }
+
+ return Value::getTrue();
+ }
+
+ const char *ExpressionAnd::getOpName() const {
+ return "$and";
+ }
+
+ void ExpressionAnd::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ /*
+ There are two patterns we can handle:
+ (1) one or two comparisons on the same field: { a:{$gte:3, $lt:7} }
+ (2) multiple field comparisons: {a:7, b:{$lte:6}, c:2}
+ This can be recognized as a conjunction of a set of range
+ expressions. Direct equality is a degenerate range expression;
+ range expressions can be open-ended.
+ */
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAnd::getFactory() const)() {
+ return ExpressionAnd::create;
+ }
+
+ /* -------------------- ExpressionCoerceToBool ------------------------- */
+
+ ExpressionCoerceToBool::~ExpressionCoerceToBool() {
+ }
+
+ intrusive_ptr<ExpressionCoerceToBool> ExpressionCoerceToBool::create(
+ const intrusive_ptr<Expression> &pExpression) {
+ intrusive_ptr<ExpressionCoerceToBool> pNew(
+ new ExpressionCoerceToBool(pExpression));
+ return pNew;
+ }
+
+ ExpressionCoerceToBool::ExpressionCoerceToBool(
+ const intrusive_ptr<Expression> &pTheExpression):
+ Expression(),
+ pExpression(pTheExpression) {
+ }
+
+ intrusive_ptr<Expression> ExpressionCoerceToBool::optimize() {
+ /* optimize the operand */
+ pExpression = pExpression->optimize();
+
+ /* if the operand already produces a boolean, then we don't need this */
+ /* LATER - Expression to support a "typeof" query? */
+ Expression *pE = pExpression.get();
+ if (dynamic_cast<ExpressionAnd *>(pE) ||
+ dynamic_cast<ExpressionOr *>(pE) ||
+ dynamic_cast<ExpressionNot *>(pE) ||
+ dynamic_cast<ExpressionCoerceToBool *>(pE))
+ return pExpression;
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionCoerceToBool::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ intrusive_ptr<const Value> pResult(pExpression->evaluate(pDocument));
+ bool b = pResult->coerceToBool();
+ if (b)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ void ExpressionCoerceToBool::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ void ExpressionCoerceToBool::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ /* ----------------------- ExpressionCompare --------------------------- */
+
+ ExpressionCompare::~ExpressionCompare() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createEq() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(EQ));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createNe() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(NE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createCmp() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(CMP));
+ return pExpression;
+ }
+
+ ExpressionCompare::ExpressionCompare(CmpOp theCmpOp):
+ ExpressionNary(),
+ cmpOp(theCmpOp) {
+ }
+
+ void ExpressionCompare::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ /*
+ Lookup table for truth value returns
+ */
+ struct CmpLookup {
+ bool truthValue[3]; /* truth value for -1, 0, 1 */
+ Expression::CmpOp reverse; /* reverse comparison operator */
+ char name[5]; /* string name (w/trailing '\0') */
+ };
+ static const CmpLookup cmpLookup[7] = {
+ /* -1 0 1 reverse name */
+ /* EQ */ { { false, true, false }, Expression::EQ, "$eq" },
+ /* NE */ { { true, false, true }, Expression::NE, "$ne" },
+ /* GT */ { { false, false, true }, Expression::LT, "$gt" },
+ /* GTE */ { { false, true, true }, Expression::LTE, "$gte" },
+ /* LT */ { { true, false, false }, Expression::GT, "$lt" },
+ /* LTE */ { { true, true, false }, Expression::GTE, "$lte" },
+ /* CMP */ { { false, false, false }, Expression::CMP, "$cmp" },
+ };
+
+ intrusive_ptr<Expression> ExpressionCompare::optimize() {
+ /* first optimize the comparison operands */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /*
+ If the result of optimization is no longer a comparison, there's
+ nothing more we can do.
+ */
+ ExpressionCompare *pCmp = dynamic_cast<ExpressionCompare *>(pE.get());
+ if (!pCmp)
+ return pE;
+
+ /* check to see if optimizing comparison operator is supported */
+ CmpOp newOp = pCmp->cmpOp;
+ if (newOp == CMP)
+ return pE; // not reversible: there's nothing more we can do
+
+ /*
+ There's one localized optimization we recognize: a comparison
+ between a field and a constant. If we recognize that pattern,
+ replace it with an ExpressionFieldRange.
+
+ When looking for this pattern, note that the operands could appear
+ in any order. If we need to reverse the sense of the comparison to
+ put it into the required canonical form, do so.
+ */
+ intrusive_ptr<Expression> pLeft(pCmp->vpOperand[0]);
+ intrusive_ptr<Expression> pRight(pCmp->vpOperand[1]);
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ dynamic_pointer_cast<ExpressionFieldPath>(pLeft));
+ intrusive_ptr<ExpressionConstant> pConstant;
+ if (pFieldPath.get()) {
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pRight);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+ }
+ else {
+ /* if the first operand wasn't a path, see if it's a constant */
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pLeft);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+
+ /* the left operand was a constant; see if the right is a path */
+ pFieldPath = dynamic_pointer_cast<ExpressionFieldPath>(pRight);
+ if (!pFieldPath.get())
+ return pE; // there's nothing more we can do
+
+ /* these were not in canonical order, so reverse the sense */
+ newOp = cmpLookup[newOp].reverse;
+ }
+
+ return ExpressionFieldRange::create(
+ pFieldPath, newOp, pConstant->getValue());
+ }
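+
+ /*
+ A sketch of the rewrite above (hypothetical operands):
+
+ { $gt : [ "$a", 5 ] } --> field range a > 5
+ { $gt : [ 5, "$a" ] } --> field range a < 5 (sense reversed
+ via cmpLookup)
+
+ Anything other than exactly one field path compared against one
+ constant is left as an ordinary ExpressionCompare.
+ */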
+
+ intrusive_ptr<const Value> ExpressionCompare::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ BSONType leftType = pLeft->getType();
+ BSONType rightType = pRight->getType();
+ uassert(15994, str::stream() << getOpName() <<
+ ": no automatic conversion for types " <<
+ leftType << " and " << rightType,
+ leftType == rightType);
+ // CW TODO at least for now. later, handle automatic conversions
+
+ int cmp = 0;
+ switch(leftType) {
+ case NumberDouble: {
+ double left = pLeft->getDouble();
+ double right = pRight->getDouble();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case NumberInt: {
+ int left = pLeft->getInt();
+ int right = pRight->getInt();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case String: {
+ string left(pLeft->getString());
+ string right(pRight->getString());
+ cmp = signum(left.compare(right));
+ break;
+ }
+
+ default:
+ uassert(15995, str::stream() <<
+ "can't compare values of type " << leftType, false);
+ break;
+ }
+
+ if (cmpOp == CMP) {
+ switch(cmp) {
+ case -1:
+ return Value::getMinusOne();
+ case 0:
+ return Value::getZero();
+ case 1:
+ return Value::getOne();
+
+ default:
+ assert(false); // CW TODO internal error
+ return Value::getNull();
+ }
+ }
+
+ bool returnValue = cmpLookup[cmpOp].truthValue[cmp + 1];
+ if (returnValue)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ const char *ExpressionCompare::getOpName() const {
+ return cmpLookup[cmpOp].name;
+ }
+
+ /* ----------------------- ExpressionCond ------------------------------ */
+
+ ExpressionCond::~ExpressionCond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCond::create() {
+ intrusive_ptr<ExpressionCond> pExpression(new ExpressionCond());
+ return pExpression;
+ }
+
+ ExpressionCond::ExpressionCond():
+ ExpressionNary() {
+ }
+
+ void ExpressionCond::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionCond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pCond(vpOperand[0]->evaluate(pDocument));
+ int idx = pCond->coerceToBool() ? 1 : 2;
+ return vpOperand[idx]->evaluate(pDocument);
+ }
+
+ const char *ExpressionCond::getOpName() const {
+ return "$cond";
+ }
+
+ /* ---------------------- ExpressionConstant --------------------------- */
+
+ ExpressionConstant::~ExpressionConstant() {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<ExpressionConstant> pEC(
+ new ExpressionConstant(pBsonElement));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(BSONElement *pBsonElement):
+ pValue(Value::createFromBsonElement(pBsonElement)) {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::create(
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionConstant> pEC(new ExpressionConstant(pValue));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(
+ const intrusive_ptr<const Value> &pTheValue):
+ pValue(pTheValue) {
+ }
+
+
+ intrusive_ptr<Expression> ExpressionConstant::optimize() {
+ /* nothing to do */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionConstant::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return pValue;
+ }
+
+ void ExpressionConstant::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ /*
+ For depth greater than one, do the regular thing.
+
+ Depth is greater than one whenever this constant sits inside an
+ operator node; a constant is only at depth 1 (counting up as we
+ go down the expression tree) when it is itself the entire
+ top-level expression for a field.
+
+ See the comment below for how that case is handled.
+ */
+ if (depth > 1) {
+ pValue->addToBsonObj(pBuilder, fieldName);
+ return;
+ }
+
+ /*
+ If this happens at the top level, we don't have any direct way
+ to express it. However, we may need to if constant folding
+ reduced expressions to constants, and we need to re-materialize
+ the pipeline in order to ship it to a shard server. This has
+ forced the introduction of {$const: ...}.
+ */
+ BSONObjBuilder constBuilder;
+ pValue->addToBsonObj(&constBuilder, "$const");
+ pBuilder->append(fieldName, constBuilder.done());
+ }
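+
+ /*
+ A sketch of the case above: if constant folding reduces
+ { $project : { x : { $add : [ 1, 2 ] } } } to the constant 3,
+ re-materializing the pipeline for a shard produces
+ { $project : { x : { $const : 3 } } }; a bare { x : 3 } at the
+ top level would be read as a field-inclusion flag, not a literal.
+ */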
+
+ void ExpressionConstant::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pValue->addToBsonArray(pBuilder);
+ }
+
+ const char *ExpressionConstant::getOpName() const {
+ assert(false); // this has no name
+ return NULL;
+ }
+
+ /* ---------------------- ExpressionDayOfMonth ------------------------- */
+
+ ExpressionDayOfMonth::~ExpressionDayOfMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfMonth::create() {
+ intrusive_ptr<ExpressionDayOfMonth> pExpression(new ExpressionDayOfMonth());
+ return pExpression;
+ }
+
+ ExpressionDayOfMonth::ExpressionDayOfMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mday);
+ }
+
+ const char *ExpressionDayOfMonth::getOpName() const {
+ return "$dayOfMonth";
+ }
+
+ /* ------------------------- ExpressionDayOfWeek ----------------------------- */
+
+ ExpressionDayOfWeek::~ExpressionDayOfWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfWeek::create() {
+ intrusive_ptr<ExpressionDayOfWeek> pExpression(new ExpressionDayOfWeek());
+ return pExpression;
+ }
+
+ ExpressionDayOfWeek::ExpressionDayOfWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_wday+1); // MySQL uses 1-7 tm uses 0-6
+ }
+
+ const char *ExpressionDayOfWeek::getOpName() const {
+ return "$dayOfWeek";
+ }
+
+ /* ------------------------- ExpressionDayOfYear ----------------------------- */
+
+ ExpressionDayOfYear::~ExpressionDayOfYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfYear::create() {
+ intrusive_ptr<ExpressionDayOfYear> pExpression(new ExpressionDayOfYear());
+ return pExpression;
+ }
+
+ ExpressionDayOfYear::ExpressionDayOfYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfYear::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_yday+1); // MySQL uses 1-366 tm uses 0-365
+ }
+
+ const char *ExpressionDayOfYear::getOpName() const {
+ return "$dayOfYear";
+ }
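+
+ /*
+ The three day operators above differ only in the tm field read
+ and the offset applied. For a hypothetical date of 2011-12-24
+ (a Saturday):
+
+ $dayOfMonth -> 24 (tm_mday, already 1-based)
+ $dayOfWeek -> 7 (tm_wday 6, plus 1)
+ $dayOfYear -> 358 (tm_yday 357, plus 1)
+ */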
+
+ /* ----------------------- ExpressionDivide ---------------------------- */
+
+ ExpressionDivide::~ExpressionDivide() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDivide::create() {
+ intrusive_ptr<ExpressionDivide> pExpression(new ExpressionDivide());
+ return pExpression;
+ }
+
+ ExpressionDivide::ExpressionDivide():
+ ExpressionNary() {
+ }
+
+ void ExpressionDivide::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDivide::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ double right = pRight->coerceToDouble();
+ if (right == 0)
+ return Value::getUndefined();
+
+ double left = pLeft->coerceToDouble();
+
+ return Value::createDouble(left / right);
+ }
+
+ const char *ExpressionDivide::getOpName() const {
+ return "$divide";
+ }
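+
+ /*
+ Note the contract above: $divide always produces a double, and a
+ zero divisor yields the undefined value rather than an error,
+ e.g. (hypothetical operands)
+
+ { $divide : [ 7, 2 ] } -> 3.5
+ { $divide : [ 7, 0 ] } -> undefined
+ */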
+
+ /* ---------------------- ExpressionObject --------------------------- */
+
+ ExpressionObject::~ExpressionObject() {
+ }
+
+ intrusive_ptr<ExpressionObject> ExpressionObject::create() {
+ intrusive_ptr<ExpressionObject> pExpression(new ExpressionObject());
+ return pExpression;
+ }
+
+ ExpressionObject::ExpressionObject():
+ excludePaths(false),
+ path(),
+ vFieldName(),
+ vpExpression() {
+ }
+
+ intrusive_ptr<Expression> ExpressionObject::optimize() {
+ const size_t n = vpExpression.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpExpression[i]->optimize());
+ vpExpression[i] = pE;
+ }
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ void ExpressionObject::addToDocument(
+ const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t pathSize = path.size();
+ set<string>::const_iterator end(path.end());
+
+ /*
+ Take care of inclusions or exclusions. Note that _id is special
+ in that it is always included unless it is specifically excluded;
+ we use excludeId for that in case excludePaths is false, which
+ means to include paths.
+ */
+ if (pathSize) {
+ auto_ptr<FieldIterator> pIter(pDocument->createFieldIterator());
+ if (excludePaths) {
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(pIter->next());
+
+ /*
+ If the field in the document is not in the exclusion set,
+ add it to the result document.
+
+ Note that exclusions are only allowed on leaves, so we
+ can assume we don't have to descend recursively here.
+ */
+ if (path.find(field.first) != end)
+ continue; // we found it, so don't add it
+
+ pResult->addField(field.first, field.second);
+ }
+ }
+ else { /* !excludePaths */
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(
+ pIter->next());
+ /*
+ If the field in the document is in the inclusion set,
+ add it to the result document. Or, if we're not
+ excluding _id, and it is _id, include it.
+
+ Note that this could be an inclusion along a pathway,
+ so we look for an ExpressionObject in vpExpression; when
+ we find one, we populate the result with the evaluation
+ of that on the nested object, yielding relative paths.
+ This also allows us to handle intermediate arrays; if we
+ encounter one, we repeat this for each array element.
+ */
+ if (path.find(field.first) != end) {
+ /* find the Expression */
+ const size_t n = vFieldName.size();
+ size_t i;
+ Expression *pE = NULL;
+ for(i = 0; i < n; ++i) {
+ if (field.first.compare(vFieldName[i]) == 0) {
+ pE = vpExpression[i].get();
+ break;
+ }
+ }
+
+ /*
+ If we didn't find an expression, it's the last path
+ element to include.
+ */
+ if (!pE) {
+ pResult->addField(field.first, field.second);
+ continue;
+ }
+
+ ExpressionObject *pChild =
+ dynamic_cast<ExpressionObject *>(pE);
+ assert(pChild);
+
+ /*
+ Check on the type of the result object. If it's an
+ object, just walk down into that recursively, and
+ add it to the result.
+ */
+ BSONType valueType = field.second->getType();
+ if (valueType == Object) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ field.second->getDocument()));
+ pResult->addField(vFieldName[i],
+ Value::createDocument(pD));
+ }
+ else if (valueType == Array) {
+ /*
+ If it's an array, we have to do the same thing,
+ but to each array element. Then, add the array
+ of results to the current document.
+ */
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pVI(
+ field.second->getArray());
+ while(pVI->more()) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ pVI->next()->getDocument()));
+ result.push_back(Value::createDocument(pD));
+ }
+
+ pResult->addField(vFieldName[i],
+ Value::createArray(result));
+ }
+ }
+ }
+ }
+ }
+
+ /* add any remaining fields we haven't already taken care of */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ string fieldName(vFieldName[i]);
+
+ /* if we've already dealt with this field, above, do nothing */
+ if (path.find(fieldName) != end)
+ continue;
+
+ intrusive_ptr<const Value> pValue(
+ vpExpression[i]->evaluate(pDocument));
+
+ /*
+ Don't add non-existent values (note: different from NULL);
+ this is consistent with existing selection syntax which doesn't
+ force the appearance of non-existent fields.
+ */
+ if (pValue->getType() == Undefined)
+ continue;
+
+ pResult->addField(fieldName, pValue);
+ }
+ }
+
+ size_t ExpressionObject::getSizeHint(
+ const intrusive_ptr<Document> &pDocument) const {
+ size_t sizeHint = pDocument->getFieldCount();
+ const size_t pathSize = path.size();
+ if (!excludePaths)
+ sizeHint += pathSize;
+ else {
+ size_t excludeCount = pathSize;
+ if (sizeHint > excludeCount)
+ sizeHint -= excludeCount;
+ else
+ sizeHint = 0;
+ }
+
+ /* account for the additional computed fields */
+ sizeHint += vFieldName.size();
+
+ return sizeHint;
+ }
+
+ intrusive_ptr<Document> ExpressionObject::evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* create and populate the result */
+ intrusive_ptr<Document> pResult(
+ Document::create(getSizeHint(pDocument)));
+ addToDocument(pResult, pDocument);
+ return pResult;
+ }
+
+ intrusive_ptr<const Value> ExpressionObject::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return Value::createDocument(evaluateDocument(pDocument));
+ }
+
+ void ExpressionObject::addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression) {
+ /* must have an expression */
+ assert(pExpression.get());
+
+ /* parse the field path */
+ FieldPath fieldPath(fieldName);
+ uassert(16008, str::stream() <<
+ "an expression object's field names cannot be field paths (at \"" <<
+ fieldName << "\")", fieldPath.getPathLength() == 1);
+
+ /* make sure it isn't a name we've included or excluded */
+ set<string>::iterator ex(path.find(fieldName));
+ uassert(16009, str::stream() <<
+ "can't add a field to an object expression that has already been excluded (at \"" <<
+ fieldName << "\")", ex == path.end());
+
+ /* make sure it isn't a name we've already got */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ uassert(16010, str::stream() <<
+ "can't add the same field to an object expression more than once (at \"" <<
+ fieldName << "\")",
+ fieldName.compare(vFieldName[i]) != 0);
+ }
+
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pExpression);
+ }
+
+ void ExpressionObject::includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn, bool excludeLast) {
+
+ /* get the current path field name */
+ string fieldName(pPath->getFieldName(pathi));
+ uassert(16011,
+ "an object expression can't include an empty field-name",
+ fieldName.length());
+
+ const size_t pathCount = path.size();
+
+ /* if this is the leaf-most object, stop */
+ if (pathi == pathn - 1) {
+ /*
+ Make sure the exclusion configuration of this node matches
+ the requested result. Or, that this is the first (determining)
+ specification.
+ */
+ uassert(16012, str::stream() <<
+ "incompatible exclusion for \"" <<
+ pPath->getPath(false) <<
+ "\" because of a prior inclusion that includes a common sub-path",
+ ((excludePaths == excludeLast) || !pathCount));
+
+ excludePaths = excludeLast; // if (!pathCount), set this
+ path.insert(fieldName);
+ return;
+ }
+
+ /* this level had better be about inclusions */
+ uassert(16013, str::stream() <<
+ "incompatible inclusion for \"" << pPath->getPath(false) <<
+ "\" because of a prior exclusion that includes a common sub-path",
+ !excludePaths);
+
+ /* see if we already know about this field */
+ const size_t n = vFieldName.size();
+ size_t i;
+ for(i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ /* find the right object, and continue */
+ ExpressionObject *pChild;
+ if (i < n) {
+ /* the intermediate child already exists */
+ pChild = dynamic_cast<ExpressionObject *>(vpExpression[i].get());
+ assert(pChild);
+ }
+ else {
+ /*
+ If we get here, the intervening child isn't already there,
+ so create it.
+ */
+ intrusive_ptr<ExpressionObject> pSharedChild(
+ ExpressionObject::create());
+ path.insert(fieldName);
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pSharedChild);
+ pChild = pSharedChild.get();
+ }
+
+ // LATER CW TODO turn this into a loop
+ pChild->includePath(pPath, pathi + 1, pathn, excludeLast);
+ }
+
+ void ExpressionObject::includePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), false);
+ }
+
+ void ExpressionObject::excludePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), true);
+ }
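+
+ /*
+ A sketch of how a projection builds this tree (field names are
+ hypothetical): including "a.b" and then "a.c" calls includePath()
+ twice; the first call creates an intermediate ExpressionObject
+ child for "a" and records the leaf "b", and the second reuses
+ that child and adds the leaf "c". Subsequently excluding "a.d"
+ would trip the "incompatible exclusion" uassert (16012), because
+ a subtree that already includes paths can't also exclude them.
+ */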
+
+ intrusive_ptr<Expression> ExpressionObject::getField(
+ const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpExpression[i];
+ }
+
+ /* if we got here, we didn't find it */
+ return intrusive_ptr<Expression>();
+ }
+
+ void ExpressionObject::emitPaths(
+ BSONObjBuilder *pBuilder, vector<string> *pvPath) const {
+ if (!path.size())
+ return;
+
+ /* we use these for loops */
+ const size_t nField = vFieldName.size();
+ const size_t nPath = pvPath->size();
+
+ /*
+ We can iterate over the inclusion/exclusion paths in their
+ (random) set order because they don't affect the order that
+ fields are listed in the result. That comes from the underlying
+ Document they are fetched from.
+ */
+ for(set<string>::const_iterator end(path.end()),
+ iter(path.begin()); iter != end; ++iter) {
+
+ /* find the matching field description */
+ size_t iField = 0;
+ for(; iField < nField; ++iField) {
+ if (iter->compare(vFieldName[iField]) == 0)
+ break;
+ }
+
+ if (iField == nField) {
+ /*
+ If we didn't find a matching field description, this is the
+ leaf, so add the path.
+ */
+ stringstream ss;
+
+ for(size_t iPath = 0; iPath < nPath; ++iPath)
+ ss << (*pvPath)[iPath] << ".";
+ ss << *iter;
+
+ pBuilder->append(ss.str(), !excludePaths);
+ }
+ else {
+ /*
+ If we found a matching field description, then we need to
+ descend into the next level.
+ */
+ Expression *pE = vpExpression[iField].get();
+ ExpressionObject *pEO = dynamic_cast<ExpressionObject *>(pE);
+ assert(pEO);
+
+ /*
+ Add the current field name to the path being built up,
+ then go down into the next level.
+ */
+ PathPusher pathPusher(pvPath, vFieldName[iField]);
+ pEO->emitPaths(pBuilder, pvPath);
+ }
+ }
+ }
+
+ void ExpressionObject::documentToBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+
+ /* emit any inclusion/exclusion paths */
+ vector<string> vPath;
+ emitPaths(pBuilder, &vPath);
+
+ /* then add any expressions */
+ const size_t nField = vFieldName.size();
+ const set<string>::const_iterator pathEnd(path.end());
+ for(size_t iField = 0; iField < nField; ++iField) {
+ string fieldName(vFieldName[iField]);
+
+ /* if we already took care of this, don't repeat it */
+ if (path.find(fieldName) != pathEnd)
+ continue;
+
+ vpExpression[iField]->addToBsonObj(pBuilder, fieldName, depth + 1);
+ }
+ }
+
+ void ExpressionObject::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(fieldName, objBuilder.done());
+ }
+
+ void ExpressionObject::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(objBuilder.done());
+ }
+
+ /* --------------------- ExpressionFieldPath --------------------------- */
+
+ ExpressionFieldPath::~ExpressionFieldPath() {
+ }
+
+ intrusive_ptr<ExpressionFieldPath> ExpressionFieldPath::create(
+ const string &fieldPath) {
+ intrusive_ptr<ExpressionFieldPath> pExpression(
+ new ExpressionFieldPath(fieldPath));
+ return pExpression;
+ }
+
+ ExpressionFieldPath::ExpressionFieldPath(
+ const string &theFieldPath):
+ fieldPath(theFieldPath) {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldPath::optimize() {
+ /* nothing can be done for these */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const {
+ intrusive_ptr<const Value> pValue; /* the return value */
+
+ pValue = pDocument->getValue(fieldPath.getFieldName(index));
+
+ /* if the field doesn't exist, quit with an undefined value */
+ if (!pValue.get())
+ return Value::getUndefined();
+
+ /* if we've hit the end of the path, stop */
+ ++index;
+ if (index >= pathLength)
+ return pValue;
+
+ /*
+ We're diving deeper. If the value was null, return null.
+ */
+ BSONType type = pValue->getType();
+ if ((type == Undefined) || (type == jstNULL))
+ return Value::getUndefined();
+
+ if (type == Object) {
+ /* extract from the next level down */
+ return evaluatePath(index, pathLength, pValue->getDocument());
+ }
+
+ if (type == Array) {
+ /*
+ We're going to repeat this for each member of the array,
+ building up a new array as we go.
+ */
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pIter(pValue->getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pItem(pIter->next());
+ BSONType iType = pItem->getType();
+ if ((iType == Undefined) || (iType == jstNULL)) {
+ result.push_back(pItem);
+ continue;
+ }
+
+ uassert(16014, str::stream() <<
+ "the element \"" << fieldPath.getFieldName(index) <<
+ "\" along the dotted path \"" <<
+ fieldPath.getPath(false) <<
+ "\" is not an object, and cannot be navigated",
+ iType == Object);
+ intrusive_ptr<const Value> itemResult(
+ evaluatePath(index, pathLength, pItem->getDocument()));
+ result.push_back(itemResult);
+ }
+
+ return Value::createArray(result);
+ }
+
+ uassert(16015, str::stream() <<
+ "can't navigate into value of type " << type <<
+ " at \"" << fieldPath.getFieldName(index) <<
+ "\" in dotted path \"" << fieldPath.getPath(false) << "\"",
+ false);
+ return intrusive_ptr<const Value>();
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return evaluatePath(0, fieldPath.getPathLength(), pDocument);
+ }
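+
+ /*
+ A sketch of the traversal above (hypothetical document): the path
+ "a.b" applied to
+
+ { a : [ { b : 1 }, { b : 2 }, null ] }
+
+ maps the rest of the path over each array element, yielding
+ [ 1, 2, null ]. An element missing "b" would contribute
+ undefined, and a non-object, non-null element would trip
+ uassert 16014.
+ */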
+
+ void ExpressionFieldPath::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ pBuilder->append(fieldName, fieldPath.getPath(true));
+ }
+
+ void ExpressionFieldPath::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pBuilder->append(getFieldPath(true));
+ }
+
+ /* --------------------- ExpressionFieldRange -------------------------- */
+
+ ExpressionFieldRange::~ExpressionFieldRange() {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldRange::optimize() {
+ /* if there is no range to match, this will never evaluate true */
+ if (!pRange.get())
+ return ExpressionConstant::create(Value::getFalse());
+
+ /*
+ If we ended up with a range that is unbounded at both ends,
+ anything matches. I don't know how that can happen, given
+ intersect()'s interface, but here it is, just in case.
+ */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return ExpressionConstant::create(Value::getTrue());
+
+ /*
+ In all other cases, we have to test candidate values. The
+ intersect() method has already optimized those tests, so there
+ aren't any more optimizations to look for here.
+ */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldRange::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* if there's no range, there can't be a match */
+ if (!pRange.get())
+ return Value::getFalse();
+
+ /* get the value of the specified field */
+ intrusive_ptr<const Value> pValue(pFieldPath->evaluate(pDocument));
+
+ /* see if it fits within any of the ranges */
+ if (pRange->contains(pValue))
+ return Value::getTrue();
+
+ return Value::getFalse();
+ }
+
+ void ExpressionFieldRange::addToBson(
+ Builder *pBuilder, unsigned depth) const {
+ if (!pRange.get()) {
+ /* nothing will satisfy this predicate */
+ pBuilder->append(false);
+ return;
+ }
+
+ if (!pRange->pTop.get() && !pRange->pBottom.get()) {
+ /* any value will satisfy this predicate */
+ pBuilder->append(true);
+ return;
+ }
+
+ if (pRange->pTop.get() == pRange->pBottom.get()) {
+ BSONArrayBuilder operands;
+ pFieldPath->addToBsonArray(&operands, depth);
+ pRange->pTop->addToBsonArray(&operands);
+
+ BSONObjBuilder equals;
+ equals.append("$eq", operands.arr());
+ pBuilder->append(&equals);
+ return;
+ }
+
+ BSONObjBuilder leftOperator;
+ if (pRange->pBottom.get()) {
+ BSONArrayBuilder leftOperands;
+ pFieldPath->addToBsonArray(&leftOperands, depth);
+ pRange->pBottom->addToBsonArray(&leftOperands);
+ leftOperator.append(
+ (pRange->bottomOpen ? "$gt" : "$gte"),
+ leftOperands.arr());
+
+ if (!pRange->pTop.get()) {
+ pBuilder->append(&leftOperator);
+ return;
+ }
+ }
+
+ BSONObjBuilder rightOperator;
+ if (pRange->pTop.get()) {
+ BSONArrayBuilder rightOperands;
+ pFieldPath->addToBsonArray(&rightOperands, depth);
+ pRange->pTop->addToBsonArray(&rightOperands);
+ rightOperator.append(
+ (pRange->topOpen ? "$lt" : "$lte"),
+ rightOperands.arr());
+
+ if (!pRange->pBottom.get()) {
+ pBuilder->append(&rightOperator);
+ return;
+ }
+ }
+
+ BSONArrayBuilder andOperands;
+ andOperands.append(leftOperator.done());
+ andOperands.append(rightOperator.done());
+ BSONObjBuilder andOperator;
+ andOperator.append("$and", andOperands.arr());
+ pBuilder->append(&andOperator);
+ }
+
+ void ExpressionFieldRange::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BuilderObj builder(pBuilder, fieldName);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BuilderArray builder(pBuilder);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(pRange.get()); // otherwise, we can't do anything
+
+ /* if there are no endpoints, then every value is accepted */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return; // nothing to add to the predicate
+
+ /* we're going to need the field path */
+ string fieldPath(pFieldPath->getFieldPath(false));
+
+ BSONObjBuilder range;
+ if (pRange->pBottom.get()) {
+ /* the test for equality doesn't generate a subobject */
+ if (pRange->pBottom.get() == pRange->pTop.get()) {
+ pRange->pBottom->addToBsonObj(pBuilder, fieldPath);
+ return;
+ }
+
+ pRange->pBottom->addToBsonObj(
+ &range, (pRange->bottomOpen ? "$gt" : "$gte"));
+ }
+
+ if (pRange->pTop.get()) {
+ pRange->pTop->addToBsonObj(
+ &range, (pRange->topOpen ? "$lt" : "$lte"));
+ }
+
+ pBuilder->append(fieldPath, range.done());
+ }
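+
+ /*
+ A sketch of the generated matcher (hypothetical field and bounds):
+ the range 3 <= a < 7 becomes { a : { $gte : 3, $lt : 7 } }, while
+ the degenerate equality range a == 5 becomes just { a : 5 }.
+ */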
+
+ intrusive_ptr<ExpressionFieldRange> ExpressionFieldRange::create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionFieldRange> pE(
+ new ExpressionFieldRange(pFieldPath, cmpOp, pValue));
+ return pE;
+ }
+
+ ExpressionFieldRange::ExpressionFieldRange(
+ const intrusive_ptr<ExpressionFieldPath> &pTheFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue):
+ pFieldPath(pTheFieldPath),
+ pRange(new Range(cmpOp, pValue)) {
+ }
+
+ void ExpressionFieldRange::intersect(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue) {
+
+ /* create the new range */
+ scoped_ptr<Range> pNew(new Range(cmpOp, pValue));
+
+ /*
+ Intersect the new range with the existing one. If they overlap,
+ the result is the narrower common range; if they are disjoint,
+ intersect() returns NULL, and this expression can never match
+ (optimize() will then collapse it to the constant false).
+ */
+ pRange.reset(pRange->intersect(pNew.get()));
+ }
+
+ ExpressionFieldRange::Range::Range(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue):
+ bottomOpen(false),
+ topOpen(false),
+ pBottom(),
+ pTop() {
+ switch(cmpOp) {
+ case NE:
+ bottomOpen = topOpen = true;
+ /* FALLTHROUGH */
+ case EQ:
+ pBottom = pTop = pValue;
+ break;
+
+ case GT:
+ bottomOpen = true;
+ /* FALLTHROUGH */
+ case GTE:
+ topOpen = true;
+ pBottom = pValue;
+ break;
+
+ case LT:
+ topOpen = true;
+ /* FALLTHROUGH */
+ case LTE:
+ bottomOpen = true;
+ pTop = pValue;
+ break;
+
+ case CMP:
+ assert(false); // not allowed
+ break;
+ }
+ }
+
+ ExpressionFieldRange::Range::Range(const Range &rRange):
+ bottomOpen(rRange.bottomOpen),
+ topOpen(rRange.topOpen),
+ pBottom(rRange.pBottom),
+ pTop(rRange.pTop) {
+ }
+
+ ExpressionFieldRange::Range::Range(
+ const intrusive_ptr<const Value> &pTheBottom, bool theBottomOpen,
+ const intrusive_ptr<const Value> &pTheTop, bool theTopOpen):
+ bottomOpen(theBottomOpen),
+ topOpen(theTopOpen),
+ pBottom(pTheBottom),
+ pTop(pTheTop) {
+ }
+
+ ExpressionFieldRange::Range *ExpressionFieldRange::Range::intersect(
+ const Range *pRange) const {
+ /*
+ Find the max of the bottom end of the ranges.
+
+ Start by assuming the maximum is from pRange. Then, if we have
+ values of our own, see if they're greater.
+ */
+ intrusive_ptr<const Value> pMaxBottom(pRange->pBottom);
+ bool maxBottomOpen = pRange->bottomOpen;
+ if (pBottom.get()) {
+ if (!pRange->pBottom.get()) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ else {
+ const int cmp = Value::compare(pBottom, pRange->pBottom);
+ if (cmp == 0)
+ maxBottomOpen = bottomOpen || pRange->bottomOpen;
+ else if (cmp > 0) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ }
+ }
+
+ /*
+ Find the minimum of the tops of the ranges.
+
+ Start by assuming the minimum is from pRange. Then, if we have
+ values of our own, see if they are less.
+ */
+ intrusive_ptr<const Value> pMinTop(pRange->pTop);
+ bool minTopOpen = pRange->topOpen;
+ if (pTop.get()) {
+ if (!pRange->pTop.get()) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ else {
+ const int cmp = Value::compare(pTop, pRange->pTop);
+ if (cmp == 0)
+ minTopOpen = topOpen || pRange->topOpen;
+ else if (cmp < 0) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ }
+ }
+
+ /*
+ If the intersections didn't create a disjoint set, create the
+ new range.
+ */
+ if (Value::compare(pMaxBottom, pMinTop) <= 0)
+ return new Range(pMaxBottom, maxBottomOpen, pMinTop, minTopOpen);
+
+ /* if we got here, the intersection is empty */
+ return NULL;
+ }
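+
+ /*
+ A sketch with hypothetical bounds: intersecting [3, 10) with
+ (5, unbounded) keeps the larger bottom and the smaller top,
+ giving (5, 10); intersecting [3, 5) with [7, 9) finds the max
+ bottom (7) above the min top (5) and returns NULL, which
+ optimize() then folds to the constant false.
+ */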
+
+ bool ExpressionFieldRange::Range::contains(
+ const intrusive_ptr<const Value> &pValue) const {
+ if (pBottom.get()) {
+ const int cmp = Value::compare(pValue, pBottom);
+ if (cmp < 0)
+ return false;
+ if (bottomOpen && (cmp == 0))
+ return false;
+ }
+
+ if (pTop.get()) {
+ const int cmp = Value::compare(pValue, pTop);
+ if (cmp > 0)
+ return false;
+ if (topOpen && (cmp == 0))
+ return false;
+ }
+
+ return true;
+ }
+
+ /* ------------------------- ExpressionMinute ----------------------------- */
+
+ ExpressionMinute::~ExpressionMinute() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMinute::create() {
+ intrusive_ptr<ExpressionMinute> pExpression(new ExpressionMinute());
+ return pExpression;
+ }
+
+ ExpressionMinute::ExpressionMinute():
+ ExpressionNary() {
+ }
+
+ void ExpressionMinute::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMinute::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_min);
+ }
+
+ const char *ExpressionMinute::getOpName() const {
+ return "$minute";
+ }
+
+ /* ----------------------- ExpressionMod ---------------------------- */
+
+ ExpressionMod::~ExpressionMod() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMod::create() {
+ intrusive_ptr<ExpressionMod> pExpression(new ExpressionMod());
+ return pExpression;
+ }
+
+ ExpressionMod::ExpressionMod():
+ ExpressionNary() {
+ }
+
+ void ExpressionMod::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMod::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ BSONType productType;
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(pRight->getType(), pLeft->getType());
+
+ long long right = pRight->coerceToLong();
+ if (right == 0)
+ return Value::getUndefined();
+
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left % right);
+ return Value::createInt((int)(left % right));
+ }
+
+ const char *ExpressionMod::getOpName() const {
+ return "$mod";
+ }
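+
+ /*
+ Note that, unlike $add and $multiply, $mod does its arithmetic
+ entirely in long long (coerceToLong() discards any fractional
+ part), and the widest operand type only decides between a
+ NumberLong and a NumberInt result; a zero divisor yields
+ undefined, mirroring $divide. E.g. (hypothetical operands)
+ { $mod : [ 7, 3 ] } -> 1.
+ */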
+
+ /* ------------------------- ExpressionMonth ----------------------------- */
+
+ ExpressionMonth::~ExpressionMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMonth::create() {
+ intrusive_ptr<ExpressionMonth> pExpression(new ExpressionMonth());
+ return pExpression;
+ }
+
+ ExpressionMonth::ExpressionMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mon+1); // MySQL uses 1-12 tm uses 0-11
+ }
+
+ const char *ExpressionMonth::getOpName() const {
+ return "$month";
+ }
+
+ /* ------------------------- ExpressionMultiply ----------------------------- */
+
+ ExpressionMultiply::~ExpressionMultiply() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMultiply::create() {
+ intrusive_ptr<ExpressionMultiply> pExpression(new ExpressionMultiply());
+ return pExpression;
+ }
+
+ ExpressionMultiply::ExpressionMultiply():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionMultiply::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the widest numeric type
+ seen so far; the result is built from the matching running total.
+ */
+ double doubleProduct = 1;
+ long long longProduct = 1;
+ BSONType productType = NumberInt;
+
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(productType, pValue->getType());
+ doubleProduct *= pValue->coerceToDouble();
+ longProduct *= pValue->coerceToLong();
+ }
+
+ if (productType == NumberDouble)
+ return Value::createDouble(doubleProduct);
+ if (productType == NumberLong)
+ return Value::createLong(longProduct);
+ return Value::createInt((int)longProduct);
+ }
+
+ const char *ExpressionMultiply::getOpName() const {
+ return "$multiply";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionMultiply::getFactory() const)() {
+ return ExpressionMultiply::create;
+ }
+
+ /* ------------------------- ExpressionHour ----------------------------- */
+
+ ExpressionHour::~ExpressionHour() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionHour::create() {
+ intrusive_ptr<ExpressionHour> pExpression(new ExpressionHour());
+ return pExpression;
+ }
+
+ ExpressionHour::ExpressionHour():
+ ExpressionNary() {
+ }
+
+ void ExpressionHour::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionHour::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_hour);
+ }
+
+ const char *ExpressionHour::getOpName() const {
+ return "$hour";
+ }
+
+ /* ----------------------- ExpressionIfNull ---------------------------- */
+
+ ExpressionIfNull::~ExpressionIfNull() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionIfNull::create() {
+ intrusive_ptr<ExpressionIfNull> pExpression(new ExpressionIfNull());
+ return pExpression;
+ }
+
+ ExpressionIfNull::ExpressionIfNull():
+ ExpressionNary() {
+ }
+
+ void ExpressionIfNull::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionIfNull::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ BSONType leftType = pLeft->getType();
+
+ if ((leftType != Undefined) && (leftType != jstNULL))
+ return pLeft;
+
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ return pRight;
+ }
+
+ const char *ExpressionIfNull::getOpName() const {
+ return "$ifNull";
+ }
+
+ /* ------------------------ ExpressionNary ----------------------------- */
+
+ ExpressionNary::ExpressionNary():
+ vpOperand() {
+ }
+
+ intrusive_ptr<Expression> ExpressionNary::optimize() {
+ unsigned constCount = 0; // count of constant operands
+ unsigned stringCount = 0; // count of constant string operands
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pNew(vpOperand[i]->optimize());
+
+ /* substitute the optimized expression */
+ vpOperand[i] = pNew;
+
+ /* check to see if the result was a constant */
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pNew.get());
+ if (pConst) {
+ ++constCount;
+ if (pConst->getValue()->getType() == String)
+ ++stringCount;
+ }
+ }
+
+ /*
+ If all the operands are constant, we can replace this expression
+ with a constant. We can find the value by evaluating this
+ expression over a NULL Document because evaluating the
+ ExpressionConstant never refers to the argument Document.
+ */
+ if (constCount == n) {
+ intrusive_ptr<const Value> pResult(
+ evaluate(intrusive_ptr<Document>()));
+ intrusive_ptr<Expression> pReplacement(
+ ExpressionConstant::create(pResult));
+ return pReplacement;
+ }
+
+ /*
+ If there are any strings, we can't re-arrange anything, so stop
+ now.
+
+ LATER: we could concatenate adjacent strings as a special case.
+ */
+ if (stringCount)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If there's no more than one constant, then we can't do any
+ constant folding, so don't bother going any further.
+ */
+ if (constCount <= 1)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If the operator isn't commutative or associative, there's nothing
+ more we can do. We test that by seeing if we can get a factory;
+ if we can, we can use it to construct a temporary expression which
+ we'll evaluate to collapse as many constants as we can down to
+ a single one.
+ */
+ intrusive_ptr<ExpressionNary> (*const pFactory)() = getFactory();
+ if (!pFactory)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ Create a new Expression that will be the replacement for this one.
+ We actually create two: one to hold constant expressions, and
+ one to hold non-constants. Once we've got these, we evaluate
+ the constant expression to produce a single value, as above.
+ We then add this operand to the end of the non-constant expression,
+ and return that.
+ */
+ intrusive_ptr<ExpressionNary> pNew((*pFactory)());
+ intrusive_ptr<ExpressionNary> pConst((*pFactory)());
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpOperand[i]);
+ if (dynamic_cast<ExpressionConstant *>(pE.get()))
+ pConst->addOperand(pE);
+ else {
+ /*
+ If the child operand is the same type as this, then we can
+ extract its operands and inline them here, since having a
+ factory already tells us the operator is commutative and
+ associative. We can detect sameness of the child operator by
+ checking for equality of the factory.
+
+ Note we don't have to do this recursively, because we
+ called optimize() on all the children first thing in
+ this call to optimize().
+ */
+ ExpressionNary *pNary =
+ dynamic_cast<ExpressionNary *>(pE.get());
+ if (!pNary)
+ pNew->addOperand(pE);
+ else {
+ intrusive_ptr<ExpressionNary> (*const pChildFactory)() =
+ pNary->getFactory();
+ if (pChildFactory != pFactory)
+ pNew->addOperand(pE);
+ else {
+ /* same factory, so flatten */
+ size_t nChild = pNary->vpOperand.size();
+ for(size_t iChild = 0; iChild < nChild; ++iChild) {
+ intrusive_ptr<Expression> pCE(
+ pNary->vpOperand[iChild]);
+ if (dynamic_cast<ExpressionConstant *>(pCE.get()))
+ pConst->addOperand(pCE);
+ else
+ pNew->addOperand(pCE);
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ If there was only one constant, add it to the end of the expression
+ operand vector.
+ */
+ if (pConst->vpOperand.size() == 1)
+ pNew->addOperand(pConst->vpOperand[0]);
+ else if (pConst->vpOperand.size() > 1) {
+ /*
+ If there was more than one constant, collapse all the constants
+ together before adding the result to the end of the expression
+ operand vector.
+ */
+ intrusive_ptr<const Value> pResult(
+ pConst->evaluate(intrusive_ptr<Document>()));
+ pNew->addOperand(ExpressionConstant::create(pResult));
+ }
+
+ return pNew;
+ }
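+
+ /*
+ An end-to-end sketch of the folding above (hypothetical operands):
+
+ { $add : [ 1, "$a", { $add : [ 2, "$b" ] }, 3 ] }
+
+ The nested $add shares this factory, so it is flattened, and the
+ constants 1, 2, and 3 are collapsed, yielding
+ { $add : [ "$a", "$b", 6 ] } -- with the single folded constant
+ at the end, which is the placement the $and and $or
+ optimizations rely on.
+ */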
+
+ void ExpressionNary::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ vpOperand.push_back(pExpression);
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionNary::getFactory() const)() {
+ return NULL;
+ }
+
+ void ExpressionNary::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+ const size_t nOperand = vpOperand.size();
+ assert(nOperand > 0);
+ if (nOperand == 1) {
+ vpOperand[0]->addToBsonObj(pBuilder, pOpName, depth + 1);
+ return;
+ }
+
+ /* build up the array */
+ BSONArrayBuilder arrBuilder;
+ for(size_t i = 0; i < nOperand; ++i)
+ vpOperand[i]->addToBsonArray(&arrBuilder, depth + 1);
+
+ pBuilder->append(pOpName, arrBuilder.arr());
+ }
+
+ void ExpressionNary::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(fieldName, exprBuilder.done());
+ }
+
+ void ExpressionNary::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(exprBuilder.done());
+ }
+
+ void ExpressionNary::checkArgLimit(unsigned maxArgs) const {
+ uassert(15993, str::stream() << getOpName() <<
+ " only takes " << maxArgs <<
+ " operand" << (maxArgs == 1 ? "" : "s"),
+ vpOperand.size() < maxArgs);
+ }
+
+ void ExpressionNary::checkArgCount(unsigned reqArgs) const {
+ uassert(15997, str::stream() << getOpName() <<
+ ": insufficient operands; " << reqArgs <<
+ " required, only got " << vpOperand.size(),
+ vpOperand.size() == reqArgs);
+ }
+
+ /* ----------------------- ExpressionNoOp ------------------------------ */
+
+ ExpressionNoOp::~ExpressionNoOp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNoOp::create() {
+ intrusive_ptr<ExpressionNoOp> pExpression(new ExpressionNoOp());
+ return pExpression;
+ }
+
+ intrusive_ptr<Expression> ExpressionNoOp::optimize() {
+ checkArgCount(1);
+ intrusive_ptr<Expression> pR(vpOperand[0]->optimize());
+ return pR;
+ }
+
+ ExpressionNoOp::ExpressionNoOp():
+ ExpressionNary() {
+ }
+
+ void ExpressionNoOp::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNoOp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pValue(vpOperand[0]->evaluate(pDocument));
+ return pValue;
+ }
+
+ const char *ExpressionNoOp::getOpName() const {
+ return "$noOp";
+ }
+
+ /* ------------------------- ExpressionNot ----------------------------- */
+
+ ExpressionNot::~ExpressionNot() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNot::create() {
+ intrusive_ptr<ExpressionNot> pExpression(new ExpressionNot());
+ return pExpression;
+ }
+
+ ExpressionNot::ExpressionNot():
+ ExpressionNary() {
+ }
+
+ void ExpressionNot::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNot::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pOp(vpOperand[0]->evaluate(pDocument));
+
+ bool b = pOp->coerceToBool();
+ if (b)
+ return Value::getFalse();
+ return Value::getTrue();
+ }
+
+ const char *ExpressionNot::getOpName() const {
+ return "$not";
+ }
+
+ /* -------------------------- ExpressionOr ----------------------------- */
+
+ ExpressionOr::~ExpressionOr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionOr::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionOr());
+ return pExpression;
+ }
+
+ ExpressionOr::ExpressionOr():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionOr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (pValue->coerceToBool())
+ return Value::getTrue();
+ }
+
+ return Value::getFalse();
+ }
+
+ void ExpressionOr::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ BSONObjBuilder opArray;
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i)
+ vpOperand[i]->toMatcherBson(&opArray, depth + 1);
+
+ pBuilder->append("$or", opArray.done());
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionOr::getFactory() const)() {
+ return ExpressionOr::create;
+ }
+
+ intrusive_ptr<Expression> ExpressionOr::optimize() {
+ /* optimize the disjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a disjunction, we can't do anything */
+ ExpressionOr *pOr = dynamic_cast<ExpressionOr *>(pE.get());
+ if (!pOr)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+ promised by ExpressionNary::optimize()), then there's nothing
+ we can do.
+ */
+ const size_t n = pOr->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pOr->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's true,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getTrue()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was false, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ disjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pOr->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "false" value, and return the new expression.
+ */
+ pOr->vpOperand.resize(n - 1);
+ return pE;
+ }
+
+ const char *ExpressionOr::getOpName() const {
+ return "$or";
+ }
+
+ /* ------------------------- ExpressionSecond ----------------------------- */
+
+ ExpressionSecond::~ExpressionSecond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSecond::create() {
+ intrusive_ptr<ExpressionSecond> pExpression(new ExpressionSecond());
+ return pExpression;
+ }
+
+ ExpressionSecond::ExpressionSecond():
+ ExpressionNary() {
+ }
+
+ void ExpressionSecond::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSecond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_sec);
+ }
+
+ const char *ExpressionSecond::getOpName() const {
+ return "$second";
+ }
+
+ /* ----------------------- ExpressionStrcasecmp ---------------------------- */
+
+ ExpressionStrcasecmp::~ExpressionStrcasecmp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionStrcasecmp::create() {
+ intrusive_ptr<ExpressionStrcasecmp> pExpression(new ExpressionStrcasecmp());
+ return pExpression;
+ }
+
+ ExpressionStrcasecmp::ExpressionStrcasecmp():
+ ExpressionNary() {
+ }
+
+ void ExpressionStrcasecmp::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionStrcasecmp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pString1(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pString2(vpOperand[1]->evaluate(pDocument));
+
+        /* boost::iequals() only reports equality (a bool, not an ordering),
+           so upper-case copies of both strings must actually be allocated
+           and compared */
+ string str1 = boost::to_upper_copy( pString1->coerceToString() );
+ string str2 = boost::to_upper_copy( pString2->coerceToString() );
+ int result = str1.compare(str2);
+
+ if (result == 0)
+ return Value::getZero();
+ if (result > 0)
+ return Value::getOne();
+ return Value::getMinusOne();
+ }
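+
+    /*
+      Illustrative example (not part of the original source): in a
+      pipeline, { $strcasecmp: [ "$name", "MongoDB" ] } yields 0 when
+      $name matches "MongoDB" in any case, 1 when its upper-cased form
+      sorts after "MONGODB", and -1 when it sorts before it.
+    */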
+
+ const char *ExpressionStrcasecmp::getOpName() const {
+ return "$strcasecmp";
+ }
+
+ /* ----------------------- ExpressionSubstr ---------------------------- */
+
+ ExpressionSubstr::~ExpressionSubstr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubstr::create() {
+ intrusive_ptr<ExpressionSubstr> pExpression(new ExpressionSubstr());
+ return pExpression;
+ }
+
+ ExpressionSubstr::ExpressionSubstr():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubstr::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubstr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLower(vpOperand[1]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLength(vpOperand[2]->evaluate(pDocument));
+
+ string str = pString->coerceToString();
+ uassert(16034, str::stream() << getOpName() <<
+ ": starting index must be a numeric type (is BSON type " <<
+ pLower->getType() << ")",
+ (pLower->getType() == NumberInt
+ || pLower->getType() == NumberLong
+ || pLower->getType() == NumberDouble));
+ uassert(16035, str::stream() << getOpName() <<
+ ": length must be a numeric type (is BSON type " <<
+ pLength->getType() << ")",
+ (pLength->getType() == NumberInt
+ || pLength->getType() == NumberLong
+ || pLength->getType() == NumberDouble));
+ string::size_type lower = static_cast< string::size_type >( pLower->coerceToLong() );
+ string::size_type length = static_cast< string::size_type >( pLength->coerceToLong() );
+ return Value::createString( str.substr(lower, length) );
+ }
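+
+    /*
+      Illustrative example (not part of the original source):
+      { $substr: [ "$title", 0, 4 ] } evaluates to the first four bytes
+      of $title; the starting index and length are byte-based, as with
+      std::string::substr() above.
+    */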
+
+ const char *ExpressionSubstr::getOpName() const {
+ return "$substr";
+ }
+
+ /* ----------------------- ExpressionSubtract ---------------------------- */
+
+ ExpressionSubtract::~ExpressionSubtract() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubtract::create() {
+ intrusive_ptr<ExpressionSubtract> pExpression(new ExpressionSubtract());
+ return pExpression;
+ }
+
+ ExpressionSubtract::ExpressionSubtract():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubtract::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubtract::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ BSONType productType;
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ if (pLeft->getType() == Date) {
+ long long right;
+ long long left = pLeft->coerceToDate();
+ if (pRight->getType() == Date)
+ right = pRight->coerceToDate();
+            else
+                /* a non-Date right operand is interpreted as a number of days */
+                right = static_cast<long long>(pRight->coerceToDouble()*24*60*60*1000);
+ return Value::createDate(Date_t(left-right));
+ }
+
+        uassert(15996, "cannot subtract a date from a non-date",
+ pRight->getType() != Date);
+
+ productType = Value::getWidestNumeric(
+ pRight->getType(), pLeft->getType());
+
+ if (productType == NumberDouble) {
+ double right = pRight->coerceToDouble();
+ double left = pLeft->coerceToDouble();
+ return Value::createDouble(left - right);
+ }
+
+ long long right = pRight->coerceToLong();
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left - right);
+ return Value::createInt((int)(left - right));
+ }
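+
+    /*
+      Illustrative example (not part of the original source): with a
+      Date-valued $ts, { $subtract: [ "$ts", 1 ] } shifts $ts back by
+      one day, because a non-Date right operand is interpreted as a
+      count of days; { $subtract: [ "$ts", "$ts0" ] } yields a Date
+      holding the millisecond difference of the two dates.
+    */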
+
+ const char *ExpressionSubtract::getOpName() const {
+ return "$subtract";
+ }
+
+ /* ------------------------- ExpressionToLower ----------------------------- */
+
+ ExpressionToLower::~ExpressionToLower() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToLower::create() {
+ intrusive_ptr<ExpressionToLower> pExpression(new ExpressionToLower());
+ return pExpression;
+ }
+
+ ExpressionToLower::ExpressionToLower():
+ ExpressionNary() {
+ }
+
+ void ExpressionToLower::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToLower::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str = pString->coerceToString();
+ boost::to_lower(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToLower::getOpName() const {
+ return "$toLower";
+ }
+
+ /* ------------------------- ExpressionToUpper -------------------------- */
+
+ ExpressionToUpper::~ExpressionToUpper() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToUpper::create() {
+ intrusive_ptr<ExpressionToUpper> pExpression(new ExpressionToUpper());
+ return pExpression;
+ }
+
+ ExpressionToUpper::ExpressionToUpper():
+ ExpressionNary() {
+ }
+
+ void ExpressionToUpper::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToUpper::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str(pString->coerceToString());
+ boost::to_upper(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToUpper::getOpName() const {
+ return "$toUpper";
+ }
+
+ /* ------------------------- ExpressionWeek ----------------------------- */
+
+ ExpressionWeek::~ExpressionWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionWeek::create() {
+ intrusive_ptr<ExpressionWeek> pExpression(new ExpressionWeek());
+ return pExpression;
+ }
+
+ ExpressionWeek::ExpressionWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ int dayOfWeek = date.tm_wday+1;
+ int dayOfYear = date.tm_yday;
+ int week = 0;
+ int janFirst = 0;
+ int offset = 0;
+
+        /* day of the week (mod 7) on which January 1 fell */
+        janFirst = dayOfWeek - dayOfYear % 7;
+        /* offset that aligns week boundaries to Sundays */
+        offset = (janFirst + 6) % 7;
+        /* zero-based week of the year, with weeks starting on Sunday */
+        week = (dayOfYear + offset) / 7;
+ return Value::createInt(week);
+ }
+
+ const char *ExpressionWeek::getOpName() const {
+ return "$week";
+ }
+
+ /* ------------------------- ExpressionYear ----------------------------- */
+
+ ExpressionYear::~ExpressionYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionYear::create() {
+ intrusive_ptr<ExpressionYear> pExpression(new ExpressionYear());
+ return pExpression;
+ }
+
+ ExpressionYear::ExpressionYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionYear::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_year+1900); // tm_year is years since 1900
+ }
+
+ const char *ExpressionYear::getOpName() const {
+ return "$year";
+ }
+}
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h
new file mode 100755
index 00000000000..c49e385a3c7
--- /dev/null
+++ b/src/mongo/db/pipeline/expression.h
@@ -0,0 +1,1223 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "db/pipeline/field_path.h"
+#include "util/intrusive_counter.h"
+
+
+namespace mongo {
+ class BSONArrayBuilder;
+ class BSONElement;
+ class BSONObjBuilder;
+ class Builder;
+ class Document;
+ class ExpressionContext;
+ class Value;
+
+ class Expression :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~Expression() {};
+
+ /*
+ Optimize the Expression.
+
+ This provides an opportunity to do constant folding, or to
+ collapse nested operators that have the same precedence, such as
+ $add, $and, or $or.
+
+ The Expression should be replaced with the return value, which may
+ or may not be the same object. In the case of constant folding,
+ a computed expression may be replaced by a constant.
+
+ @returns the optimized Expression
+ */
+ virtual intrusive_ptr<Expression> optimize() = 0;
+
+ /*
+ Evaluate the Expression using the given document as input.
+
+ @returns the computed value
+ */
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ object that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ @param fieldName the name the object should be given
+ */
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName,
+ unsigned depth) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ array that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder,
+ unsigned depth) const = 0;
+
+ /*
+ Convert the expression into a BSONObj that corresponds to the
+ db.collection.find() predicate language. This is intended for
+ use by DocumentSourceFilter.
+
+ This is more limited than the full expression language supported
+ by all available expressions in a DocumentSource processing
+ pipeline, and will fail with an assertion if an attempt is made
+ to go outside the bounds of the recognized patterns, which don't
+ include full computed expressions. There are other methods available
+ on DocumentSourceFilter which can be used to analyze a filter
+ predicate and break it up into appropriate expressions which can
+ be translated within these constraints. As a result, the default
+ implementation is to fail with an assertion; only a subset of
+ operators will be able to fulfill this request.
+
+ @param pBuilder the builder to add the expression to.
+ */
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Utility class for parseObject() below.
+
+ Only one array can be unwound in a processing pipeline. If the
+ UNWIND_OK option is used, unwindOk() will return true, and a field
+ can be declared as unwound using unwind(), after which unwindUsed()
+ will return true. Only specify UNWIND_OK if it is OK to unwind an
+ array in the current context.
+
+ DOCUMENT_OK indicates that it is OK to use a Document in the current
+ context.
+ */
+ class ObjectCtx {
+ public:
+ ObjectCtx(int options);
+ static const int UNWIND_OK = 0x0001;
+ static const int DOCUMENT_OK = 0x0002;
+
+ bool unwindOk() const;
+ bool unwindUsed() const;
+ void unwind(string fieldName);
+
+ bool documentOk() const;
+
+ private:
+ int options;
+ string unwindField;
+ };
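+
+        /*
+          Illustrative sketch (not part of the original source):
+
+              Expression::ObjectCtx ctx(
+                  Expression::ObjectCtx::UNWIND_OK |
+                  Expression::ObjectCtx::DOCUMENT_OK);
+              // ctx.unwindOk() is true; ctx.unwindUsed() remains false
+              // until ctx.unwind("someField") is called
+        */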
+
+ /*
+ Parse a BSONElement Object. The object could represent a functional
+ expression or a Document expression.
+
+ @param pBsonElement the element representing the object
+          @param pCtx an ObjectCtx representing the options above
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx);
+
+ static const char unwindName[];
+
+ /*
+ Parse a BSONElement Object which has already been determined to be
+          a functional expression.
+
+ @param pOpName the name of the (prefix) operator
+ @param pBsonElement the BSONElement to parse
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseExpression(
+ const char *pOpName, BSONElement *pBsonElement);
+
+
+ /*
+ Parse a BSONElement which is an operand in an Expression.
+
+ @param pBsonElement the expected operand's BSONElement
+ @returns the parsed operand, as an Expression
+ */
+ static intrusive_ptr<Expression> parseOperand(
+ BSONElement *pBsonElement);
+
+ /*
+ Produce a field path string with the field prefix removed.
+
+ Throws an error if the field prefix is not present.
+
+ @param prefixedField the prefixed field
+ @returns the field path with the prefix removed
+ */
+ static string removeFieldPrefix(const string &prefixedField);
+
+ /*
+ Enumeration of comparison operators. These are shared between a
+ few expression implementations, so they are factored out here.
+
+ Any changes to these values require adjustment of the lookup
+ table in the implementation.
+ */
+ enum CmpOp {
+ EQ = 0, // return true for a == b, false otherwise
+ NE = 1, // return true for a != b, false otherwise
+ GT = 2, // return true for a > b, false otherwise
+ GTE = 3, // return true for a >= b, false otherwise
+ LT = 4, // return true for a < b, false otherwise
+ LTE = 5, // return true for a <= b, false otherwise
+ CMP = 6, // return -1, 0, 1 for a < b, a == b, a > b
+ };
+
+ static int signum(int i);
+ };
+
+
+ class ExpressionNary :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionNary> {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<Expression> optimize();
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Add an operand to the n-ary expression.
+
+ @param pExpression the expression to add
+ */
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Return a factory function that will make Expression nodes of
+ the same type as this. This will be used to create constant
+ expressions for constant folding for optimize(). Only return
+ a factory function if this operator is both associative and
+ commutative. The default implementation returns NULL; optimize()
+ will recognize that and stop.
+
+ Note that ExpressionNary::optimize() promises that if it uses this
+ to fold constants, then if optimize() returns an ExpressionNary,
+ any remaining constant will be the last one in vpOperand. Derived
+ classes may take advantage of this to do further optimizations in
+ their optimize().
+
+ @returns pointer to a factory function or NULL
+ */
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Get the name of the operator.
+
+          @returns the name of the operator; this string belongs to the
+          class implementation, and should not be deleted by the caller
+ */
+ virtual const char *getOpName() const = 0;
+
+ protected:
+ ExpressionNary();
+
+ vector<intrusive_ptr<Expression> > vpOperand;
+
+ /*
+ Add the expression to the builder.
+
+ If there is only one operand (a unary operator), then the operand
+ is added directly, without an array. For more than one operand,
+ a named array is created. In both cases, the result is an object.
+
+ @param pBuilder the (blank) builder to add the expression to
+ @param pOpName the name of the operator
+ */
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ /*
+ Checks the current size of vpOperand; if the size equal to or
+ greater than maxArgs, fires a user assertion indicating that this
+ operator cannot have this many arguments.
+
+          Equality also fires the assertion because this is intended to
+          be used in addOperand() to check the limit *before* adding the
+          requested argument.
+
+ @param maxArgs the maximum number of arguments the operator accepts
+ */
+ void checkArgLimit(unsigned maxArgs) const;
+
+ /*
+ Checks the current size of vpOperand; if the size is not equal to
+ reqArgs, fires a user assertion indicating that this must have
+ exactly reqArgs arguments.
+
+ This is meant to be used in evaluate(), *before* the evaluation
+ takes place.
+
+ @param reqArgs the number of arguments this operator requires
+ */
+ void checkArgCount(unsigned reqArgs) const;
+ };
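+
+    /*
+      Illustrative sketch (not part of the original source): the pattern a
+      fixed-arity operator follows using the helpers above, shown for a
+      hypothetical unary ExpressionExample:
+
+          void ExpressionExample::addOperand(
+              const intrusive_ptr<Expression> &pExpression) {
+              checkArgLimit(1);                  // at most one operand
+              ExpressionNary::addOperand(pExpression);
+          }
+
+          intrusive_ptr<const Value> ExpressionExample::evaluate(
+              const intrusive_ptr<Document> &pDocument) const {
+              checkArgCount(1);                  // exactly one operand
+              return vpOperand[0]->evaluate(pDocument);
+          }
+    */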
+
+
+ class ExpressionAdd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAdd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the sum of n operands.
+
+ @returns addition expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ protected:
+ // virtuals from ExpressionNary
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ private:
+ ExpressionAdd();
+
+ /*
+ If the operator can be optimized, we save the original here.
+
+ This is necessary because addition must follow its original operand
+ ordering strictly if a string is detected, otherwise string
+ concatenation may appear to have re-ordered the operands.
+ */
+ intrusive_ptr<ExpressionAdd> pAdd;
+ mutable bool useOriginal;
+ };
+
+
+ class ExpressionAnd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAnd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the conjunction of n operands.
+ The conjunction uses short-circuit logic; the expressions are
+ evaluated in the order they were added to the conjunction, and
+ the evaluation stops and returns false on the first operand that
+ evaluates to false.
+
+ @returns conjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionAnd();
+ };
+
+
+ class ExpressionCoerceToBool :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionCoerceToBool> {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCoerceToBool();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionCoerceToBool> create(
+ const intrusive_ptr<Expression> &pExpression);
+
+ private:
+ ExpressionCoerceToBool(const intrusive_ptr<Expression> &pExpression);
+
+ intrusive_ptr<Expression> pExpression;
+ };
+
+
+ class ExpressionCompare :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCompare();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+          Shorthands for creating various comparison expressions.
+          These conform to the uniform function pointer signature
+          required for parsing.
+
+          Each creates a particular comparison operator without any
+          operands; those must be added via ExpressionNary::addOperand().
+ */
+ static intrusive_ptr<ExpressionNary> createCmp();
+ static intrusive_ptr<ExpressionNary> createEq();
+ static intrusive_ptr<ExpressionNary> createNe();
+ static intrusive_ptr<ExpressionNary> createGt();
+ static intrusive_ptr<ExpressionNary> createGte();
+ static intrusive_ptr<ExpressionNary> createLt();
+ static intrusive_ptr<ExpressionNary> createLte();
+
+ private:
+ friend class ExpressionFieldRange;
+ ExpressionCompare(CmpOp cmpOp);
+
+ CmpOp cmpOp;
+ };
+
+
+ class ExpressionCond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionCond();
+ };
+
+
+ class ExpressionConstant :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionConstant> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionConstant();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionConstant> createFromBsonElement(
+ BSONElement *pBsonElement);
+ static intrusive_ptr<ExpressionConstant> create(
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Get the constant value represented by this Expression.
+
+ @returns the value
+ */
+ intrusive_ptr<const Value> getValue() const;
+
+ private:
+ ExpressionConstant(BSONElement *pBsonElement);
+ ExpressionConstant(const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<const Value> pValue;
+ };
+
+
+ class ExpressionDayOfMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfMonth();
+ };
+
+
+ class ExpressionDayOfWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfWeek();
+ };
+
+
+ class ExpressionDayOfYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfYear();
+ };
+
+
+ class ExpressionDivide :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDivide();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDivide();
+ };
+
+
+ class ExpressionFieldPath :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldPath> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionFieldPath();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field path expression.
+
+ Evaluation will extract the value associated with the given field
+ path from the source document.
+
+ @param fieldPath the field path string, without any leading document
+ indicator
+ @returns the newly created field path expression
+ */
+ static intrusive_ptr<ExpressionFieldPath> create(
+ const string &fieldPath);
+
+ /*
+ Return a string representation of the field path.
+
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ @returns the dot-delimited field path
+ */
+ string getFieldPath(bool fieldPrefix) const;
+
+ /*
+ Write a string representation of the field path to a stream.
+
+          @param outStream the stream to write to
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ */
+ void writeFieldPath(ostream &outStream, bool fieldPrefix) const;
+
+ private:
+ ExpressionFieldPath(const string &fieldPath);
+
+ /*
+ Internal implementation of evaluate(), used recursively.
+
+ The internal implementation doesn't just use a loop because of
+ the possibility that we need to skip over an array. If the path
+ is "a.b.c", and a is an array, then we fan out from there, and
+ traverse "b.c" for each element of a:[...]. This requires that
+ a be an array of objects in order to navigate more deeply.
+
+ @param index current path field index to extract
+ @param pathLength maximum number of fields on field path
+ @param pDocument current document traversed to (not the top-level one)
+ @returns the field found; could be an array
+ */
+ intrusive_ptr<const Value> evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const;
+
+ FieldPath fieldPath;
+ };
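+
+    /*
+      Illustrative example (not part of the original source): evaluating
+      the field path "a.b" against { a: [ { b: 1 }, { b: 2 } ] } fans out
+      across the array as described above, yielding the array [ 1, 2 ].
+    */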
+
+
+ class ExpressionFieldRange :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldRange> {
+ public:
+ // virtuals from expression
+ virtual ~ExpressionFieldRange();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field range expression.
+
+ Field ranges are meant to match up with classic Matcher semantics,
+ and therefore are conjunctions. For example, these appear in
+ mongo shell predicates in one of these forms:
+ { a : C } -> (a == C) // degenerate "point" range
+ { a : { $lt : C } } -> (a < C) // open range
+ { a : { $gt : C1, $lte : C2 } } -> ((a > C1) && (a <= C2)) // closed
+
+ When initially created, a field range only includes one end of
+ the range. Additional points may be added via intersect().
+
+ Note that NE and CMP are not supported.
+
+ @param pFieldPath the field path for extracting the field value
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ @returns the newly created field range expression
+ */
+ static intrusive_ptr<ExpressionFieldRange> create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Add an intersecting range.
+
+ This can be done any number of times after creation. The
+ range is internally optimized for each new addition. If the new
+ intersection extends or reduces the values within the range, the
+ internal representation is adjusted to reflect that.
+
+ Note that NE and CMP are not supported.
+
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ */
+ void intersect(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+
+ private:
+ ExpressionFieldRange(const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<ExpressionFieldPath> pFieldPath;
+
+ class Range {
+ public:
+ Range(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+ Range(const Range &rRange);
+
+ Range *intersect(const Range *pRange) const;
+ bool contains(const intrusive_ptr<const Value> &pValue) const;
+
+ Range(const intrusive_ptr<const Value> &pBottom, bool bottomOpen,
+ const intrusive_ptr<const Value> &pTop, bool topOpen);
+
+ bool bottomOpen;
+ bool topOpen;
+ intrusive_ptr<const Value> pBottom;
+ intrusive_ptr<const Value> pTop;
+ };
+
+ scoped_ptr<Range> pRange;
+
+ /*
+ Add to a generic Builder.
+
+ The methods to append items to an object and an array differ by
+ their inclusion of a field name. For more complicated objects,
+ it makes sense to abstract that out and use a generic builder that
+ always looks the same, and then implement addToBsonObj() and
+ addToBsonArray() by using the common method.
+ */
+ void addToBson(Builder *pBuilder, unsigned depth) const;
+ };
+
+
+ class ExpressionHour :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionHour();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionHour();
+ };
+
+
+ class ExpressionIfNull :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionIfNull();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionIfNull();
+ };
+
+
+ class ExpressionMinute :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMinute();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMinute();
+ };
+
+
+ class ExpressionMod :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMod();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMod();
+ };
+
+
+ class ExpressionMultiply :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionMultiply();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the product of n operands.
+
+ @returns multiplication expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMultiply();
+ };
+
+
+ class ExpressionMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMonth();
+ };
+
+
+ class ExpressionNoOp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNoOp();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNoOp();
+ };
+
+
+ class ExpressionNot :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNot();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNot();
+ };
+
+
+ class ExpressionObject :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionObject> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionObject();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ evaluate(), but return a Document instead of a Value-wrapped
+ Document.
+
+ @param pDocument the input Document
+ @returns the result document
+ */
+ intrusive_ptr<Document> evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ evaluate(), but add the evaluated fields to a given document
+ instead of creating a new one.
+
+ @param pResult the Document to add the evaluated expressions to
+ @param pDocument the input Document
+ */
+ void addToDocument(const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Estimate the number of fields that will result from evaluating
+ this over pDocument. Does not include _id. This is an estimate
+ (really an upper bound) because we can't account for undefined
+ fields without actually doing the evaluation. But this is still
+ useful as an argument to Document::create(), if you plan to use
+ addToDocument().
+
+ @param pDocument the input document
+ @returns estimated number of fields that will result
+ */
+ size_t getSizeHint(const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Create an empty expression. Until fields are added, this
+ will evaluate to an empty document (object).
+ */
+ static intrusive_ptr<ExpressionObject> create();
+
+ /*
+ Add a field to the document expression.
+
+ @param fieldPath the path the evaluated expression will have in the
+ result Document
+          @param pExpression the expression to evaluate to obtain this
+          field's Value in the result Document
+ */
+ void addField(const string &fieldPath,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add a field path to the set of those to be included.
+
+ Note that including a nested field implies including everything on
+ the path leading down to it.
+
+ @param fieldPath the name of the field to be included
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Add a field path to the set of those to be excluded.
+
+ Note that excluding a nested field implies including everything on
+ the path leading down to it (because you're stating you want to see
+ all the other fields that aren't being excluded).
+
+          @param fieldPath the path of the field to be excluded
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+ Return the expression for a field.
+
+ @param fieldName the field name for the expression to return
+ @returns the expression used to compute the field, if it is present,
+ otherwise NULL.
+ */
+ intrusive_ptr<Expression> getField(const string &fieldName) const;
+
+ /*
+ Get a count of the added fields.
+
+ @returns how many fields have been added
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get a count of the exclusions.
+
+ @returns how many fields have been excluded.
+ */
+ size_t getExclusionCount() const;
+
+ /*
+ Specialized BSON conversion that allows for writing out a
+ $project specification. This creates a standalone object, which must
+ be added to a containing object with a name
+
+ @param pBuilder where to write the object to
+ */
+ void documentToBson(BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ private:
+ ExpressionObject();
+
+ void includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn,
+ bool excludeLast);
+
+ bool excludePaths;
+ set<string> path;
+
+ /* these two vectors are maintained in parallel */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+ /*
+ Utility function used by documentToBson(). Emits inclusion
+ and exclusion paths by recursively walking down the nested
+ ExpressionObject trees these have created.
+
+ @param pBuilder the builder to write boolean valued path "fields" to
+ @param pvPath pointer to a vector of strings describing the path on
+ descent; the top-level call should pass an empty vector
+ */
+ void emitPaths(BSONObjBuilder *pBuilder, vector<string> *pvPath) const;
+
+ /* utility class used by emitPaths() */
+ class PathPusher :
+ boost::noncopyable {
+ public:
+ PathPusher(vector<string> *pvPath, const string &s);
+ ~PathPusher();
+
+ private:
+ vector<string> *pvPath;
+ };
+ };
+
+
+ class ExpressionOr :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionOr();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+          Create an expression that finds the disjunction of n operands.
+          The disjunction uses short-circuit logic; the expressions are
+          evaluated in the order they were added to the disjunction, and
+          the evaluation stops and returns true on the first operand that
+          evaluates to true.
+
+          @returns disjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionOr();
+ };
+
+
+ class ExpressionSecond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSecond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSecond();
+ };
+
+
+ class ExpressionStrcasecmp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionStrcasecmp();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionStrcasecmp();
+ };
+
+
+ class ExpressionSubstr :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubstr();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubstr();
+ };
+
+
+ class ExpressionSubtract :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubtract();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubtract();
+ };
+
+
+ class ExpressionToLower :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToLower();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToLower();
+ };
+
+
+ class ExpressionToUpper :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToUpper();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToUpper();
+ };
+
+
+ class ExpressionWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionWeek();
+ };
+
+
+ class ExpressionYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionYear();
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline bool Expression::ObjectCtx::unwindOk() const {
+ return ((options & UNWIND_OK) != 0);
+ }
+
+ inline bool Expression::ObjectCtx::unwindUsed() const {
+ return (unwindField.size() != 0);
+ }
+
+ inline int Expression::signum(int i) {
+ if (i < 0)
+ return -1;
+ if (i > 0)
+ return 1;
+ return 0;
+ }
+
+ inline intrusive_ptr<const Value> ExpressionConstant::getValue() const {
+ return pValue;
+ }
+
+ inline string ExpressionFieldPath::getFieldPath(bool fieldPrefix) const {
+ return fieldPath.getPath(fieldPrefix);
+ }
+
+ inline void ExpressionFieldPath::writeFieldPath(
+ ostream &outStream, bool fieldPrefix) const {
+ return fieldPath.writePath(outStream, fieldPrefix);
+ }
+
+ inline size_t ExpressionObject::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline ExpressionObject::PathPusher::PathPusher(
+ vector<string> *pTheVPath, const string &s):
+ pvPath(pTheVPath) {
+ pvPath->push_back(s);
+ }
+
+ inline ExpressionObject::PathPusher::~PathPusher() {
+ pvPath->pop_back();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.cpp b/src/mongo/db/pipeline/expression_context.cpp
new file mode 100755
index 00000000000..4835dcfa5a9
--- /dev/null
+++ b/src/mongo/db/pipeline/expression_context.cpp
@@ -0,0 +1,35 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/expression_context.h"
+
+namespace mongo {
+
+ ExpressionContext::~ExpressionContext() {
+ }
+
+    ExpressionContext::ExpressionContext():
+ inShard(false),
+ inRouter(false) {
+ }
+
+ ExpressionContext *ExpressionContext::create() {
+ return new ExpressionContext();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.h b/src/mongo/db/pipeline/expression_context.h
new file mode 100755
index 00000000000..0277039c80b
--- /dev/null
+++ b/src/mongo/db/pipeline/expression_context.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+
+ class ExpressionContext :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ExpressionContext();
+
+ void setInShard(bool b);
+ void setInRouter(bool b);
+
+ bool getInShard() const;
+ bool getInRouter() const;
+
+ static ExpressionContext *create();
+
+ private:
+ ExpressionContext();
+
+ bool inShard;
+ bool inRouter;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void ExpressionContext::setInShard(bool b) {
+ inShard = b;
+ }
+
+ inline void ExpressionContext::setInRouter(bool b) {
+ inRouter = b;
+ }
+
+ inline bool ExpressionContext::getInShard() const {
+ return inShard;
+ }
+
+ inline bool ExpressionContext::getInRouter() const {
+ return inRouter;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp
new file mode 100755
index 00000000000..96e1fc92f83
--- /dev/null
+++ b/src/mongo/db/pipeline/field_path.cpp
@@ -0,0 +1,87 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/field_path.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ FieldPath::~FieldPath() {
+ }
+
+ FieldPath::FieldPath():
+ vFieldName() {
+ }
+
+ FieldPath::FieldPath(const string &fieldPath):
+ vFieldName() {
+ /*
+ The field path could be using dot notation.
+ Break the field path up by peeling off successive pieces.
+ */
+ size_t startpos = 0;
+ while(true) {
+ /* find the next dot */
+ const size_t dotpos = fieldPath.find('.', startpos);
+
+ /* if there are no more dots, use the remainder of the string */
+ if (dotpos == fieldPath.npos) {
+ vFieldName.push_back(fieldPath.substr(startpos, dotpos));
+ break;
+ }
+
+ /* use the string up to the dot */
+ const size_t length = dotpos - startpos;
+ uassert(15998, str::stream() <<
+ "field names cannot be zero length (in path \"" <<
+ fieldPath << "\")",
+ length > 0);
+
+ vFieldName.push_back(fieldPath.substr(startpos, length));
+
+ /* next time, search starting one spot after that */
+ startpos = dotpos + 1;
+ }
+ }
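+
+    /*
+      Illustrative example (not part of the original source):
+
+          FieldPath path("a.b.c");
+          // path.getPathLength() == 3
+          // path.getFieldName(1) == "b"
+          // path.getPath(true) == "$a.b.c"
+    */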
+
+ string FieldPath::getPath(bool fieldPrefix) const {
+ stringstream ss;
+ writePath(ss, fieldPrefix);
+ return ss.str();
+ }
+
+ void FieldPath::writePath(ostream &outStream, bool fieldPrefix) const {
+ if (fieldPrefix)
+ outStream << "$";
+
+ outStream << vFieldName[0];
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 1; i < n; ++i)
+ outStream << "." << vFieldName[i];
+ }
+
+ FieldPath &FieldPath::operator=(const FieldPath &rRHS) {
+ if (this != &rRHS) {
+ vFieldName = rRHS.vFieldName;
+ }
+
+ return *this;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.h b/src/mongo/db/pipeline/field_path.h
new file mode 100755
index 00000000000..810c5d0c7ea
--- /dev/null
+++ b/src/mongo/db/pipeline/field_path.h
@@ -0,0 +1,82 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class FieldPath {
+ public:
+ virtual ~FieldPath();
+
+ FieldPath(const string &fieldPath);
+ FieldPath();
+
+ /*
+ Get the number of path elements in the field path.
+
+ @returns the number of path elements
+ */
+ size_t getPathLength() const;
+
+ /*
+ Get a particular path element from the path.
+
+ @param i the index of the path element
+ @returns the path element
+ */
+ string getFieldName(size_t i) const;
+
+ /*
+ Get the full path.
+
+ @param fieldPrefix whether or not to include the field prefix
+ @returns the complete field path
+ */
+ string getPath(bool fieldPrefix) const;
+
+ /*
+ Write the full path.
+
+ @param outStream where to write the path to
+ @param fieldPrefix whether or not to include the field prefix
+ */
+ void writePath(ostream &outStream, bool fieldPrefix) const;
+
+ FieldPath &operator=(const FieldPath &rRHS);
+
+ private:
+ vector<string> vFieldName;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t FieldPath::getPathLength() const {
+ return vFieldName.size();
+ }
+
+ inline string FieldPath::getFieldName(size_t i) const {
+ return vFieldName[i];
+ }
+
+}
+
diff --git a/src/mongo/db/pipeline/value.cpp b/src/mongo/db/pipeline/value.cpp
new file mode 100755
index 00000000000..b83dec359cf
--- /dev/null
+++ b/src/mongo/db/pipeline/value.cpp
@@ -0,0 +1,1034 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/value.h"
+
+#include <boost/functional/hash.hpp>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ const intrusive_ptr<const Value> Value::pFieldUndefined(
+ new ValueStatic(Undefined));
+ const intrusive_ptr<const Value> Value::pFieldNull(new ValueStatic());
+ const intrusive_ptr<const Value> Value::pFieldTrue(new ValueStatic(true));
+ const intrusive_ptr<const Value> Value::pFieldFalse(new ValueStatic(false));
+ const intrusive_ptr<const Value> Value::pFieldMinusOne(new ValueStatic(-1));
+ const intrusive_ptr<const Value> Value::pFieldZero(new ValueStatic(0));
+ const intrusive_ptr<const Value> Value::pFieldOne(new ValueStatic(1));
+
+ Value::~Value() {
+ }
+
+ Value::Value():
+ type(jstNULL),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ }
+
+ Value::Value(BSONType theType):
+ type(theType),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case Undefined:
+ case jstNULL:
+ case Object: // empty
+ case Array: // empty
+ break;
+
+ case NumberDouble:
+ simple.doubleValue = 0;
+ break;
+
+ case Bool:
+ simple.boolValue = false;
+ break;
+
+ case NumberInt:
+ simple.intValue = 0;
+ break;
+
+ case Timestamp:
+ simple.timestampValue = 0;
+ break;
+
+ case NumberLong:
+ simple.longValue = 0;
+ break;
+
+ default:
+ // nothing else is allowed
+ uassert(16001, str::stream() <<
+ "can't create empty Value of type " << type, false);
+ break;
+ }
+ }
+
+ Value::Value(bool boolValue):
+ type(Bool),
+ pDocumentValue(),
+ vpValue() {
+ simple.boolValue = boolValue;
+ }
+
+ intrusive_ptr<const Value> Value::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<const Value> pValue(new Value(pBsonElement));
+ return pValue;
+ }
+
+ Value::Value(BSONElement *pBsonElement):
+ type(pBsonElement->type()),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case NumberDouble:
+ simple.doubleValue = pBsonElement->Double();
+ break;
+
+ case String:
+ stringValue = pBsonElement->String();
+ break;
+
+ case Object: {
+ BSONObj document(pBsonElement->embeddedObject());
+ pDocumentValue = Document::createFromBsonObj(&document);
+ break;
+ }
+
+ case Array: {
+ vector<BSONElement> vElement(pBsonElement->Array());
+ const size_t n = vElement.size();
+
+ vpValue.reserve(n); // save on realloc()ing
+
+ for(size_t i = 0; i < n; ++i) {
+ vpValue.push_back(
+ Value::createFromBsonElement(&vElement[i]));
+ }
+ break;
+ }
+
+ case jstOID:
+ oidValue = pBsonElement->OID();
+ break;
+
+ case Bool:
+ simple.boolValue = pBsonElement->Bool();
+ break;
+
+ case Date:
+ dateValue = pBsonElement->Date();
+ break;
+
+ case RegEx:
+ stringValue = pBsonElement->regex();
+ // TODO pBsonElement->regexFlags();
+ break;
+
+ case NumberInt:
+ simple.intValue = pBsonElement->numberInt();
+ break;
+
+ case Timestamp:
+ dateValue = pBsonElement->timestampTime();
+ break;
+
+ case NumberLong:
+ simple.longValue = pBsonElement->numberLong();
+ break;
+
+ case jstNULL:
+ break;
+
+ case BinData:
+ case Symbol:
+ case CodeWScope:
+ uassert(16002, str::stream() <<
+ "can't create Value of type " << type, false);
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ Value::Value(int intValue):
+ type(NumberInt),
+ pDocumentValue(),
+ vpValue() {
+ simple.intValue = intValue;
+ }
+
+ intrusive_ptr<const Value> Value::createInt(int value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(long long longValue):
+ type(NumberLong),
+ pDocumentValue(),
+ vpValue() {
+ simple.longValue = longValue;
+ }
+
+ intrusive_ptr<const Value> Value::createLong(long long value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(double value):
+ type(NumberDouble),
+ pDocumentValue(),
+ vpValue() {
+ simple.doubleValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDouble(double value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const Date_t &value):
+ type(Date),
+ pDocumentValue(),
+ vpValue() {
+ dateValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDate(const Date_t &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const string &value):
+ type(String),
+ pDocumentValue(),
+ vpValue() {
+ stringValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createString(const string &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const intrusive_ptr<Document> &pDocument):
+ type(Object),
+ pDocumentValue(pDocument),
+ vpValue() {
+ }
+
+ intrusive_ptr<const Value> Value::createDocument(
+ const intrusive_ptr<Document> &pDocument) {
+ intrusive_ptr<const Value> pValue(new Value(pDocument));
+ return pValue;
+ }
+
+ Value::Value(const vector<intrusive_ptr<const Value> > &thevpValue):
+ type(Array),
+ pDocumentValue(),
+ vpValue(thevpValue) {
+ }
+
+ intrusive_ptr<const Value> Value::createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue) {
+ intrusive_ptr<const Value> pValue(new Value(vpValue));
+ return pValue;
+ }
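
For orientation, a minimal usage sketch of the factory functions above. Callers never construct Values directly (the constructors are non-public); they obtain immutable intrusive_ptr<const Value> handles from the static create*() methods:

    intrusive_ptr<const Value> pInt(Value::createInt(5));
    intrusive_ptr<const Value> pStr(Value::createString("hello"));

    vector<intrusive_ptr<const Value> > vals;
    vals.push_back(pInt);
    vals.push_back(pStr);
    intrusive_ptr<const Value> pArr(Value::createArray(vals)); // [5, "hello"]
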
+
+ double Value::getDouble() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+ if (type == NumberLong)
+ return static_cast< double >( simple.longValue );
+
+ assert(type == NumberDouble);
+ return simple.doubleValue;
+ }
+
+ string Value::getString() const {
+ assert(getType() == String);
+ return stringValue;
+ }
+
+ intrusive_ptr<Document> Value::getDocument() const {
+ assert(getType() == Object);
+ return pDocumentValue;
+ }
+
+ ValueIterator::~ValueIterator() {
+ }
+
+ Value::vi::~vi() {
+ }
+
+ bool Value::vi::more() const {
+ return (nextIndex < size);
+ }
+
+ intrusive_ptr<const Value> Value::vi::next() {
+ assert(more());
+ return (*pvpValue)[nextIndex++];
+ }
+
+ Value::vi::vi(const intrusive_ptr<const Value> &pValue,
+ const vector<intrusive_ptr<const Value> > *thepvpValue):
+ size(thepvpValue->size()),
+ nextIndex(0),
+ pvpValue(thepvpValue) {
+ }
+
+ intrusive_ptr<ValueIterator> Value::getArray() const {
+ assert(getType() == Array);
+ intrusive_ptr<ValueIterator> pVI(
+ new vi(intrusive_ptr<const Value>(this), &vpValue));
+ return pVI;
+ }
+
+ OID Value::getOid() const {
+ assert(getType() == jstOID);
+ return oidValue;
+ }
+
+ bool Value::getBool() const {
+ assert(getType() == Bool);
+ return simple.boolValue;
+ }
+
+ Date_t Value::getDate() const {
+ assert(getType() == Date);
+ return dateValue;
+ }
+
+ string Value::getRegex() const {
+ assert(getType() == RegEx);
+ return stringValue;
+ }
+
+ string Value::getSymbol() const {
+ assert(getType() == Symbol);
+ return stringValue;
+ }
+
+ int Value::getInt() const {
+ assert(getType() == NumberInt);
+ return simple.intValue;
+ }
+
+ unsigned long long Value::getTimestamp() const {
+ assert(getType() == Timestamp);
+ return dateValue;
+ }
+
+ long long Value::getLong() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+
+ assert(type == NumberLong);
+ return simple.longValue;
+ }
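
Note the asymmetry in the numeric getters above: getDouble() and getLong() silently widen narrower numeric types, while getInt() and the other typed getters assert on any mismatch. A small sketch:

    intrusive_ptr<const Value> pv(Value::createInt(7));
    double d = pv->getDouble();  // 7.0 -- NumberInt widened to double
    long long n = pv->getLong(); // 7   -- NumberInt widened to long long
    // calling getInt() on a NumberLong Value, by contrast, would assert;
    // narrowing is never implicit
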
+
+ void Value::addToBson(Builder *pBuilder) const {
+ switch(getType()) {
+ case NumberDouble:
+ pBuilder->append(getDouble());
+ break;
+
+ case String:
+ pBuilder->append(getString());
+ break;
+
+ case Object: {
+ intrusive_ptr<Document> pDocument(getDocument());
+ BSONObjBuilder subBuilder;
+ pDocument->toBson(&subBuilder);
+ subBuilder.done();
+ pBuilder->append(&subBuilder);
+ break;
+ }
+
+ case Array: {
+ const size_t n = vpValue.size();
+ BSONArrayBuilder arrayBuilder(n);
+ for(size_t i = 0; i < n; ++i) {
+ vpValue[i]->addToBsonArray(&arrayBuilder);
+ }
+
+ pBuilder->append(&arrayBuilder);
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ pBuilder->append(getOid());
+ break;
+
+ case Bool:
+ pBuilder->append(getBool());
+ break;
+
+ case Date:
+ pBuilder->append(getDate());
+ break;
+
+ case RegEx:
+ pBuilder->append(getRegex());
+ break;
+
+ case Symbol:
+ pBuilder->append(getSymbol());
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ pBuilder->append(getInt());
+ break;
+
+ case Timestamp:
+ pBuilder->append((long long)getTimestamp());
+ break;
+
+ case NumberLong:
+ pBuilder->append(getLong());
+ break;
+
+ case jstNULL:
+ pBuilder->append();
+ break;
+
+ /* these shouldn't appear in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ void Value::addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const {
+ BuilderObj objBuilder(pBuilder, fieldName);
+ addToBson(&objBuilder);
+ }
+
+ void Value::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BuilderArray arrBuilder(pBuilder);
+ addToBson(&arrBuilder);
+ }
+
+ bool Value::coerceToBool() const {
+ BSONType type = getType();
+ switch(type) {
+ case NumberDouble:
+ if (simple.doubleValue != 0)
+ return true;
+ break;
+
+ case String:
+ case Object:
+ case Array:
+ case BinData:
+ case jstOID:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case Timestamp:
+ return true;
+
+ case Bool:
+ if (simple.boolValue)
+ return true;
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (simple.intValue != 0)
+ return true;
+ break;
+
+ case NumberLong:
+ if (simple.longValue != 0)
+ return true;
+ break;
+
+ case jstNULL:
+ case Undefined:
+ /* nothing to do */
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+
+ return false;
+ }
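
A few worked cases of the truthiness rules implemented above; note that strings, objects, and arrays are always true, even when empty:

    Value::createInt(0)->coerceToBool();      // false -- zero number
    Value::createDouble(0.5)->coerceToBool(); // true  -- non-zero number
    Value::createString("")->coerceToBool();  // true  -- any string
    Value::getNull()->coerceToBool();         // false -- null/undefined
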
+
+ intrusive_ptr<const Value> Value::coerceToBoolean() const {
+ bool result = coerceToBool();
+
+ /* always normalize to the singletons */
+ if (result)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ int Value::coerceToInt() const {
+ switch(type) {
+ case NumberDouble:
+ return (int)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return (int)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16003, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to int",
+ false);
+ } // switch(type)
+
+ return (int)0;
+ }
+
+ long long Value::coerceToLong() const {
+ switch(type) {
+ case NumberDouble:
+ return (long long)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16004, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to long",
+ false);
+ } // switch(type)
+
+ return (long long)0;
+ }
+
+ double Value::coerceToDouble() const {
+ switch(type) {
+ case NumberDouble:
+ return simple.doubleValue;
+
+ case NumberInt:
+ return (double)simple.intValue;
+
+ case NumberLong:
+ return (double)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16005, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ return (double)0;
+ }
+
+ Date_t Value::coerceToDate() const {
+ switch(type) {
+
+ case Date:
+ return dateValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16006, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ assert(false); // CW TODO no conversion available
+        return Date_t();
+ }
+
+ string Value::coerceToString() const {
+ stringstream ss;
+ switch(type) {
+ case NumberDouble:
+ ss << simple.doubleValue;
+ return ss.str();
+
+ case NumberInt:
+ ss << simple.intValue;
+ return ss.str();
+
+ case NumberLong:
+ ss << simple.longValue;
+ return ss.str();
+
+ case String:
+ return stringValue;
+
+ case Date:
+ return dateValue.toString();
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16007, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ return "";
+ }
+
+ int Value::compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR) {
+ BSONType lType = rL->getType();
+ BSONType rType = rR->getType();
+
+ /*
+ Special handling for Undefined and NULL values; these are types,
+ so it's easier to handle them here before we go below to handle
+ values of the same types. This allows us to compare Undefined and
+ NULL values with everything else. As coded now:
+ (*) Undefined is less than everything except itself (which is equal)
+ (*) NULL is less than everything except Undefined and itself
+ */
+ if (lType == Undefined) {
+ if (rType == Undefined)
+ return 0;
+
+ /* if rType is anything else, the left value is less */
+ return -1;
+ }
+
+ if (lType == jstNULL) {
+ if (rType == Undefined)
+ return 1;
+ if (rType == jstNULL)
+ return 0;
+
+ return -1;
+ }
+
+ if ((rType == Undefined) || (rType == jstNULL)) {
+ /*
+ We know the left value isn't Undefined, because of the above.
+ Count a NULL value as greater than an undefined one.
+ */
+ return 1;
+ }
+
+ // CW TODO for now, only compare like values
+ uassert(16016, str::stream() <<
+ "can't compare values of BSON types " << lType <<
+ " and " << rType,
+ lType == rType);
+
+ switch(lType) {
+ case NumberDouble:
+ if (rL->simple.doubleValue < rR->simple.doubleValue)
+ return -1;
+ if (rL->simple.doubleValue > rR->simple.doubleValue)
+ return 1;
+ return 0;
+
+ case String:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Object:
+ return Document::compare(rL->getDocument(), rR->getDocument());
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pli(rL->getArray());
+ intrusive_ptr<ValueIterator> pri(rR->getArray());
+
+ while(true) {
+ /* have we run out of left array? */
+ if (!pli->more()) {
+ if (!pri->more())
+ return 0; // the arrays are the same length
+
+ return -1; // the left array is shorter
+ }
+
+ /* have we run out of right array? */
+ if (!pri->more())
+ return 1; // the right array is shorter
+
+ /* compare the two corresponding elements */
+ intrusive_ptr<const Value> plv(pli->next());
+ intrusive_ptr<const Value> prv(pri->next());
+ const int cmp = Value::compare(plv, prv);
+ if (cmp)
+ return cmp; // values are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ if (rL->oidValue < rR->oidValue)
+ return -1;
+ if (rL->oidValue == rR->oidValue)
+ return 0;
+ return 1;
+
+ case Bool:
+ if (rL->simple.boolValue == rR->simple.boolValue)
+ return 0;
+ if (rL->simple.boolValue)
+ return 1;
+ return -1;
+
+ case Date:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case RegEx:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (rL->simple.intValue < rR->simple.intValue)
+ return -1;
+ if (rL->simple.intValue > rR->simple.intValue)
+ return 1;
+ return 0;
+
+ case Timestamp:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case NumberLong:
+ if (rL->simple.longValue < rR->simple.longValue)
+ return -1;
+ if (rL->simple.longValue > rR->simple.longValue)
+ return 1;
+ return 0;
+
+ case Undefined:
+ case jstNULL:
+ return 0; // treat two Undefined or NULL values as equal
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(lType)
+
+ /* NOTREACHED */
+ return 0;
+ }
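
Worked examples of the ordering implemented above, using the shared singletons and factories:

    Value::compare(Value::getUndefined(), Value::getNull()); // -1: Undefined sorts lowest
    Value::compare(Value::getNull(), Value::getZero());      // -1: NULL below all values
    Value::compare(Value::getOne(), Value::createInt(2));    // negative: 1 < 2
    // comparing unlike types, e.g. createInt(1) vs createString("1"),
    // throws via uassert 16016 -- cross-type ordering is not implemented yet
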
+
+ void Value::hash_combine(size_t &seed) const {
+ BSONType type = getType();
+ boost::hash_combine(seed, (int)type);
+
+ switch(type) {
+ case NumberDouble:
+ boost::hash_combine(seed, simple.doubleValue);
+ break;
+
+ case String:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Object:
+ getDocument()->hash_combine(seed);
+ break;
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pIter(getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pValue(pIter->next());
+ pValue->hash_combine(seed);
+                }
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ oidValue.hash_combine(seed);
+ break;
+
+ case Bool:
+ boost::hash_combine(seed, simple.boolValue);
+ break;
+
+ case Date:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case RegEx:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ boost::hash_combine(seed, simple.intValue);
+ break;
+
+ case Timestamp:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case NumberLong:
+ boost::hash_combine(seed, simple.longValue);
+ break;
+
+ case Undefined:
+ case jstNULL:
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(type)
+ }
+
+ BSONType Value::getWidestNumeric(BSONType lType, BSONType rType) {
+ if (lType == NumberDouble) {
+ switch(rType) {
+ case NumberDouble:
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberDouble;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberLong) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberLong;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberInt) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+ else if ((lType == jstNULL) || (lType == Undefined)) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+
+ /* NOTREACHED */
+ return Undefined;
+ }
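
A quick reference for the promotion lattice above:

    Value::getWidestNumeric(NumberInt,  NumberLong);   // NumberLong
    Value::getWidestNumeric(NumberLong, NumberDouble); // NumberDouble
    Value::getWidestNumeric(jstNULL,    NumberInt);    // NumberInt -- nulls defer
    Value::getWidestNumeric(String,     NumberInt);    // Undefined -- non-numeric operand
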
+
+ size_t Value::getApproximateSize() const {
+ switch(type) {
+ case String:
+ return sizeof(Value) + stringValue.length();
+
+ case Object:
+ return sizeof(Value) + pDocumentValue->getApproximateSize();
+
+ case Array: {
+ size_t size = sizeof(Value);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i) {
+ size += vpValue[i]->getApproximateSize();
+ }
+ return size;
+ }
+
+ case NumberDouble:
+ case BinData:
+ case jstOID:
+ case Bool:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case CodeWScope:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ case Undefined:
+ return sizeof(Value);
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ return sizeof(Value);
+ }
+
+ /*
+ We shouldn't get here. In order to make the implementor think about
+          these cases, they are all listed explicitly above. The compiler
+          should complain if they aren't all listed, because there's no
+          default. However, not all compilers do that. Therefore,
+ this final catch-all is here.
+ */
+ assert(false);
+ return sizeof(Value);
+ }
+
+
+ void ValueStatic::addRef() const {
+ }
+
+ void ValueStatic::release() const {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/value.h b/src/mongo/db/pipeline/value.h
new file mode 100755
index 00000000000..8bd1bcbbbfd
--- /dev/null
+++ b/src/mongo/db/pipeline/value.h
@@ -0,0 +1,468 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "bson/bsontypes.h"
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONElement;
+ class Builder;
+ class Document;
+ class Value;
+
+ class ValueIterator :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ValueIterator();
+
+ /*
+          Ask if there are more values to return.
+
+          @returns true if there are more values, false otherwise
+ */
+ virtual bool more() const = 0;
+
+ /*
+          Advance the iterator to the next value and return it.
+
+          @returns the next Value
+ */
+ virtual intrusive_ptr<const Value> next() = 0;
+ };
+
+
+ /*
+ Values are immutable, so these are passed around as
+ intrusive_ptr<const Value>.
+ */
+ class Value :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Value();
+
+ /*
+ Construct a Value from a BSONElement.
+
+ This ignores the name of the element, and only uses the value,
+ whatever type it is.
+
+ @returns a new Value initialized from the bsonElement
+ */
+ static intrusive_ptr<const Value> createFromBsonElement(
+ BSONElement *pBsonElement);
+
+ /*
+ Construct an integer-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createInt(int value);
+
+ /*
+          Construct a long long-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createLong(long long value);
+
+ /*
+ Construct a double-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDouble(double value);
+
+ /*
+ Construct a string-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createString(const string &value);
+
+ /*
+ Construct a date-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDate(const Date_t &value);
+
+ /*
+ Construct a document-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDocument(
+ const intrusive_ptr<Document> &pDocument);
+
+ /*
+ Construct an array-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue);
+
+ /*
+ Get the BSON type of the field.
+
+ If the type is jstNULL, no value getter will work.
+
+ @return the BSON type of the field.
+ */
+ BSONType getType() const;
+
+ /*
+ Getters.
+
+ @returns the Value's value; asserts if the requested value type is
+ incorrect.
+ */
+ double getDouble() const;
+ string getString() const;
+ intrusive_ptr<Document> getDocument() const;
+ intrusive_ptr<ValueIterator> getArray() const;
+ OID getOid() const;
+ bool getBool() const;
+ Date_t getDate() const;
+ string getRegex() const;
+ string getSymbol() const;
+ int getInt() const;
+ unsigned long long getTimestamp() const;
+ long long getLong() const;
+
+ /*
+ Get the length of an array value.
+
+ @returns the length of the array, if this is array-valued; otherwise
+ throws an error
+ */
+ size_t getArrayLength() const;
+
+ /*
+ Add this value to the BSON object under construction.
+ */
+ void addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const;
+
+ /*
+          Add this value to the BSON array under construction.
+
+          As an array element, the Value has no field name.
+ */
+ void addToBsonArray(BSONArrayBuilder *pBuilder) const;
+
+ /*
+ Get references to singleton instances of commonly used field values.
+ */
+ static intrusive_ptr<const Value> getUndefined();
+ static intrusive_ptr<const Value> getNull();
+ static intrusive_ptr<const Value> getTrue();
+ static intrusive_ptr<const Value> getFalse();
+ static intrusive_ptr<const Value> getMinusOne();
+ static intrusive_ptr<const Value> getZero();
+ static intrusive_ptr<const Value> getOne();
+
+ /*
+ Coerce (cast) a value to a native bool, using JSON rules.
+
+ @returns the bool value
+ */
+ bool coerceToBool() const;
+
+ /*
+ Coerce (cast) a value to a Boolean Value, using JSON rules.
+
+          @returns the Boolean Value (always one of the shared singletons)
+ */
+ intrusive_ptr<const Value> coerceToBoolean() const;
+
+ /*
+ Coerce (cast) a value to an int, using JSON rules.
+
+ @returns the int value
+ */
+ int coerceToInt() const;
+
+ /*
+ Coerce (cast) a value to a long long, using JSON rules.
+
+ @returns the long value
+ */
+ long long coerceToLong() const;
+
+ /*
+ Coerce (cast) a value to a double, using JSON rules.
+
+ @returns the double value
+ */
+ double coerceToDouble() const;
+
+ /*
+ Coerce (cast) a value to a date, using JSON rules.
+
+ @returns the date value
+ */
+ Date_t coerceToDate() const;
+
+ /*
+ Coerce (cast) a value to a string, using JSON rules.
+
+          @returns the string value
+ */
+ string coerceToString() const;
+
+ /*
+ Compare two Values.
+
+ @param rL left value
+ @param rR right value
+ @returns an integer less than zero, zero, or an integer greater than
+ zero, depending on whether rL < rR, rL == rR, or rL > rR
+ */
+ static int compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR);
+
+
+ /*
+ Figure out what the widest of two numeric types is.
+
+ Widest can be thought of as "most capable," or "able to hold the
+ largest or most precise value." The progression is Int, Long, Double.
+
+ @param rL left value
+ @param rR right value
+ @returns a BSONType of NumberInt, NumberLong, or NumberDouble
+ */
+ static BSONType getWidestNumeric(BSONType lType, BSONType rType);
+
+ /*
+ Get the approximate storage size of the value, in bytes.
+
+ @returns approximate storage size of the value.
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+          @param seed the value to augment with this Value's hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ /*
+ struct Hash is defined to enable the use of Values as
+ keys in boost::unordered_map<>.
+
+ Values are always referenced as immutables in the form
+ intrusive_ptr<const Value>, so these operate on that construction.
+ */
+ struct Hash :
+ unary_function<intrusive_ptr<const Value>, size_t> {
+ size_t operator()(const intrusive_ptr<const Value> &rV) const;
+ };
+
+ protected:
+ Value(); // creates null value
+        Value(BSONType type); // creates an empty (uninitialized) value of type;
+ // mostly useful for Undefined
+ Value(bool boolValue);
+ Value(int intValue);
+
+ private:
+ Value(BSONElement *pBsonElement);
+
+ Value(long long longValue);
+ Value(double doubleValue);
+ Value(const Date_t &dateValue);
+ Value(const string &stringValue);
+ Value(const intrusive_ptr<Document> &pDocument);
+ Value(const vector<intrusive_ptr<const Value> > &vpValue);
+
+ void addToBson(Builder *pBuilder) const;
+
+ BSONType type;
+
+ /* store value in one of these */
+ union {
+ double doubleValue;
+ bool boolValue;
+ int intValue;
+ unsigned long long timestampValue;
+ long long longValue;
+
+ } simple; // values that don't need a ctor/dtor
+ OID oidValue;
+ Date_t dateValue;
+ string stringValue; // String, Regex, Symbol
+ intrusive_ptr<Document> pDocumentValue;
+ vector<intrusive_ptr<const Value> > vpValue; // for arrays
+
+
+ /*
+ These are often used as the result of boolean or comparison
+ expressions.
+
+ These are obtained via public static getters defined above.
+ */
+ static const intrusive_ptr<const Value> pFieldUndefined;
+ static const intrusive_ptr<const Value> pFieldNull;
+ static const intrusive_ptr<const Value> pFieldTrue;
+ static const intrusive_ptr<const Value> pFieldFalse;
+ static const intrusive_ptr<const Value> pFieldMinusOne;
+ static const intrusive_ptr<const Value> pFieldZero;
+ static const intrusive_ptr<const Value> pFieldOne;
+
+ /* this implementation is used for getArray() */
+ class vi :
+ public ValueIterator {
+ public:
+ // virtuals from ValueIterator
+ virtual ~vi();
+ virtual bool more() const;
+ virtual intrusive_ptr<const Value> next();
+
+ private:
+ friend class Value;
+ vi(const intrusive_ptr<const Value> &pSource,
+ const vector<intrusive_ptr<const Value> > *pvpValue);
+
+ size_t size;
+ size_t nextIndex;
+ const vector<intrusive_ptr<const Value> > *pvpValue;
+ }; /* class vi */
+
+ };
+
+ /*
+ Equality operator for values.
+
+ Useful for unordered_map<>, etc.
+ */
+ inline bool operator==(const intrusive_ptr<const Value> &v1,
+ const intrusive_ptr<const Value> &v2) {
+ return (Value::compare(v1, v2) == 0);
+ }
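
A sketch of how Value::Hash and this equality operator are meant to be used together, e.g. to count distinct values in a $group-style accumulator. The ValueEquals functor here is hypothetical (not part of this commit); it is spelled out rather than relying on operator== lookup inside the container, which can be fragile. Assumes <boost/unordered_map.hpp> is available, as the comments above suggest:

    struct ValueEquals {
        bool operator()(const intrusive_ptr<const Value> &v1,
                        const intrusive_ptr<const Value> &v2) const {
            return Value::compare(v1, v2) == 0;
        }
    };

    boost::unordered_map<intrusive_ptr<const Value>, long long,
                         Value::Hash, ValueEquals> counts;
    ++counts[Value::createInt(1)];
    ++counts[Value::createInt(1)]; // equal hash and compare() == 0: same entry
    // counts.size() == 1; the mapped count is 2
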
+
+ /*
+ For performance reasons, there are various sharable static values
+ defined in class Value, obtainable by methods such as getUndefined(),
+ getTrue(), getOne(), etc. We don't want these to go away as they are
+ used by a multitude of threads evaluating pipelines. In order to avoid
+ having to use atomic integers in the intrusive reference counter, this
+ class overrides the reference counting methods to do nothing, making it
+ safe to use for static Values.
+
+ At this point, only the constructors necessary for the static Values in
+ common use have been defined. The remainder can be defined if necessary.
+ */
+ class ValueStatic :
+ public Value {
+ public:
+ // virtuals from IntrusiveCounterUnsigned
+ virtual void addRef() const;
+ virtual void release() const;
+
+ // constructors
+ ValueStatic();
+ ValueStatic(BSONType type);
+ ValueStatic(bool boolValue);
+ ValueStatic(int intValue);
+ };
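
Presumably the shared singletons declared earlier (pFieldTrue, pFieldZero, ...) are then defined in value.cpp along these lines; this is a sketch of the idiom, not the literal definitions:

    // no-op refcounting makes these safe as process-lifetime statics
    const intrusive_ptr<const Value> Value::pFieldTrue(new ValueStatic(true));
    const intrusive_ptr<const Value> Value::pFieldZero(new ValueStatic(0));
    const intrusive_ptr<const Value> Value::pFieldNull(new ValueStatic());
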
+}
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline BSONType Value::getType() const {
+ return type;
+ }
+
+ inline size_t Value::getArrayLength() const {
+ assert(getType() == Array);
+ return vpValue.size();
+ }
+
+ inline intrusive_ptr<const Value> Value::getUndefined() {
+ return pFieldUndefined;
+ }
+
+ inline intrusive_ptr<const Value> Value::getNull() {
+ return pFieldNull;
+ }
+
+ inline intrusive_ptr<const Value> Value::getTrue() {
+ return pFieldTrue;
+ }
+
+ inline intrusive_ptr<const Value> Value::getFalse() {
+ return pFieldFalse;
+ }
+
+ inline intrusive_ptr<const Value> Value::getMinusOne() {
+ return pFieldMinusOne;
+ }
+
+ inline intrusive_ptr<const Value> Value::getZero() {
+ return pFieldZero;
+ }
+
+ inline intrusive_ptr<const Value> Value::getOne() {
+ return pFieldOne;
+ }
+
+ inline size_t Value::Hash::operator()(
+ const intrusive_ptr<const Value> &rV) const {
+ size_t seed = 0xf0afbeef;
+ rV->hash_combine(seed);
+ return seed;
+ }
+
+ inline ValueStatic::ValueStatic():
+ Value() {
+ }
+
+ inline ValueStatic::ValueStatic(BSONType type):
+ Value(type) {
+ }
+
+ inline ValueStatic::ValueStatic(bool boolValue):
+ Value(boolValue) {
+ }
+
+ inline ValueStatic::ValueStatic(int intValue):
+ Value(intValue) {
+ }
+
+}
diff --git a/src/mongo/db/projection.cpp b/src/mongo/db/projection.cpp
new file mode 100644
index 00000000000..d07e56527af
--- /dev/null
+++ b/src/mongo/db/projection.cpp
@@ -0,0 +1,301 @@
+// projection.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "projection.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ void Projection::init( const BSONObj& o ) {
+ massert( 10371 , "can only add to Projection once", _source.isEmpty());
+ _source = o;
+
+ BSONObjIterator i( o );
+ int true_false = -1;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( ! e.isNumber() )
+ _hasNonSimple = true;
+
+ if (e.type() == Object) {
+ BSONObj obj = e.embeddedObject();
+ BSONElement e2 = obj.firstElement();
+ if ( strcmp(e2.fieldName(), "$slice") == 0 ) {
+ if (e2.isNumber()) {
+ int i = e2.numberInt();
+ if (i < 0)
+ add(e.fieldName(), i, -i); // limit is now positive
+ else
+ add(e.fieldName(), 0, i);
+
+ }
+ else if (e2.type() == Array) {
+ BSONObj arr = e2.embeddedObject();
+ uassert(13099, "$slice array wrong size", arr.nFields() == 2 );
+
+ BSONObjIterator it(arr);
+ int skip = it.next().numberInt();
+ int limit = it.next().numberInt();
+ uassert(13100, "$slice limit must be positive", limit > 0 );
+ add(e.fieldName(), skip, limit);
+
+ }
+ else {
+ uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false);
+ }
+ }
+ else {
+ uassert(13097, string("Unsupported projection option: ") + obj.firstElementFieldName(), false);
+ }
+
+ }
+ else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()) {
+ _includeID = false;
+
+ }
+ else {
+
+ add (e.fieldName(), e.trueValue());
+
+ // validate input
+ if (true_false == -1) {
+ true_false = e.trueValue();
+ _include = !e.trueValue();
+ }
+ else {
+ uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." ,
+ (bool)true_false == e.trueValue() );
+ }
+ }
+ }
+ }
+
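
To make the $slice parsing above concrete, this is how init() maps the three accepted spec forms onto add() (field name illustrative):

    // { comments: { $slice: 5 } }        -> add("comments", 0, 5)    first 5 elements
    // { comments: { $slice: -5 } }       -> add("comments", -5, 5)   last 5 elements
    // { comments: { $slice: [20, 10] } } -> add("comments", 20, 10)  skip 20, take 10
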
+ void Projection::add(const string& field, bool include) {
+ if (field.empty()) { // this is the field the user referred to
+ _include = include;
+ }
+ else {
+ _include = !include;
+
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, include);
+ }
+ }
+
+ void Projection::add(const string& field, int skip, int limit) {
+ _special = true; // can't include or exclude whole object
+
+ if (field.empty()) { // this is the field the user referred to
+ _skip = skip;
+ _limit = limit;
+ }
+ else {
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, skip, limit);
+ }
+ }
+
+ void Projection::transform( const BSONObj& in , BSONObjBuilder& b ) const {
+ BSONObjIterator i(in);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( mongoutils::str::equals( "_id" , e.fieldName() ) ) {
+ if ( _includeID )
+ b.append( e );
+ }
+ else {
+ append( b , e );
+ }
+ }
+ }
+
+ BSONObj Projection::transform( const BSONObj& in ) const {
+ BSONObjBuilder b;
+ transform( in , b );
+ return b.obj();
+ }
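
A minimal usage sketch of the public interface (spec and document are illustrative):

    Projection p;
    p.init( BSON( "x" << 1 ) ); // include mode: keep _id and x only
    BSONObj out = p.transform( BSON( "x" << 1 << "y" << 2 ) );
    // out == { x: 1 } -- y is dropped; _id would be kept if present
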
+
+
+    // b is the value part of an array-typed BSONElement
+ void Projection::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const {
+ int skip = nested ? 0 : _skip;
+ int limit = nested ? -1 : _limit;
+
+ if (skip < 0) {
+ skip = max(0, skip + a.nFields());
+ }
+
+ int i=0;
+ BSONObjIterator it(a);
+ while (it.more()) {
+ BSONElement e = it.next();
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (limit != -1 && (limit-- == 0)) {
+ break;
+ }
+
+ switch(e.type()) {
+ case Array: {
+ BSONObjBuilder subb;
+ appendArray(subb , e.embeddedObject(), true);
+ b.appendArray(b.numStr(i++), subb.obj());
+ break;
+ }
+ case Object: {
+ BSONObjBuilder subb;
+ BSONObjIterator jt(e.embeddedObject());
+ while (jt.more()) {
+ append(subb , jt.next());
+ }
+ b.append(b.numStr(i++), subb.obj());
+ break;
+ }
+ default:
+ if (_include)
+ b.appendAs(e, b.numStr(i++));
+ }
+ }
+ }
+
+ void Projection::append( BSONObjBuilder& b , const BSONElement& e ) const {
+ FieldMap::const_iterator field = _fields.find( e.fieldName() );
+
+ if (field == _fields.end()) {
+ if (_include)
+ b.append(e);
+ }
+ else {
+ Projection& subfm = *field->second;
+
+ if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ) {
+ if (subfm._include)
+ b.append(e);
+ }
+ else if (e.type() == Object) {
+ BSONObjBuilder subb;
+ BSONObjIterator it(e.embeddedObject());
+ while (it.more()) {
+ subfm.append(subb, it.next());
+ }
+ b.append(e.fieldName(), subb.obj());
+
+ }
+ else { //Array
+ BSONObjBuilder subb;
+ subfm.appendArray(subb, e.embeddedObject());
+ b.appendArray(e.fieldName(), subb.obj());
+ }
+ }
+ }
+
+ Projection::KeyOnly* Projection::checkKey( const BSONObj& keyPattern ) const {
+ if ( _include ) {
+ // if we default to including then we can't
+ // use an index because we don't know what we're missing
+ return 0;
+ }
+
+ if ( _hasNonSimple )
+ return 0;
+
+ if ( _includeID && keyPattern["_id"].eoo() )
+ return 0;
+
+        // at this point we know it's all { x : 1 } style
+
+ auto_ptr<KeyOnly> p( new KeyOnly() );
+
+ int got = 0;
+ BSONObjIterator i( keyPattern );
+ while ( i.more() ) {
+ BSONElement k = i.next();
+
+ if ( _source[k.fieldName()].type() ) {
+
+ if ( strchr( k.fieldName() , '.' ) ) {
+ // TODO we currently don't support dotted fields
+ // SERVER-2104
+ return 0;
+ }
+
+ if ( ! _includeID && mongoutils::str::equals( k.fieldName() , "_id" ) ) {
+ p->addNo();
+ }
+ else {
+ p->addYes( k.fieldName() );
+ got++;
+ }
+ }
+ else if ( mongoutils::str::equals( "_id" , k.fieldName() ) && _includeID ) {
+ p->addYes( "_id" );
+ }
+ else {
+ p->addNo();
+ }
+
+ }
+
+ int need = _source.nFields();
+ if ( ! _includeID )
+ need--;
+
+ if ( got == need )
+ return p.release();
+
+ return 0;
+ }
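
A sketch of a projection that checkKey() can cover from an index (names illustrative). Because _id is excluded and every included field appears in the key pattern, a KeyOnly is returned and results can be rebuilt from index keys alone via hydrate(), below:

    Projection p;
    p.init( BSON( "a" << 1 << "_id" << 0 ) );
    auto_ptr<Projection::KeyOnly> keyOnly( p.checkKey( BSON( "a" << 1 << "b" << 1 ) ) );
    // keyOnly.get() != 0: "a" comes from the key, "b" is marked addNo(),
    // and _id was explicitly excluded
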
+
+ BSONObj Projection::KeyOnly::hydrate( const BSONObj& key ) const {
+ assert( _include.size() == _names.size() );
+
+ BSONObjBuilder b( key.objsize() + _stringSize + 16 );
+
+ BSONObjIterator i(key);
+ unsigned n=0;
+ while ( i.more() ) {
+ assert( n < _include.size() );
+ BSONElement e = i.next();
+ if ( _include[n] ) {
+ b.appendAs( e , _names[n] );
+ }
+ n++;
+ }
+
+ return b.obj();
+ }
+}
diff --git a/src/mongo/db/projection.h b/src/mongo/db/projection.h
new file mode 100644
index 00000000000..b5e0a0c4289
--- /dev/null
+++ b/src/mongo/db/projection.h
@@ -0,0 +1,129 @@
+// projection.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * given a document and a projection specification
+ * can transform the document
+ * currently supports specifying which fields and $slice
+ */
+ class Projection {
+ public:
+
+ class KeyOnly {
+ public:
+
+ KeyOnly() : _stringSize(0) {}
+
+ BSONObj hydrate( const BSONObj& key ) const;
+
+ void addNo() { _add( false , "" ); }
+ void addYes( const string& name ) { _add( true , name ); }
+
+ private:
+
+ void _add( bool b , const string& name ) {
+ _include.push_back( b );
+ _names.push_back( name );
+ _stringSize += name.size();
+ }
+
+ vector<bool> _include; // one entry per field in key. true iff should be in output
+ vector<string> _names; // name of field since key doesn't have names
+
+ int _stringSize;
+ };
+
+ Projection() :
+ _include(true) ,
+ _special(false) ,
+ _includeID(true) ,
+ _skip(0) ,
+ _limit(-1) ,
+ _hasNonSimple(false) {
+ }
+
+ /**
+ * called once per lifetime
+ * e.g. { "x" : 1 , "a.y" : 1 }
+ */
+ void init( const BSONObj& spec );
+
+ /**
+ * @return the spec init was called with
+ */
+ BSONObj getSpec() const { return _source; }
+
+ /**
+ * transforms in according to spec
+ */
+ BSONObj transform( const BSONObj& in ) const;
+
+
+ /**
+ * transforms in according to spec
+ */
+ void transform( const BSONObj& in , BSONObjBuilder& b ) const;
+
+
+ /**
+         * @return a new KeyOnly if the keyPattern carries all the information
+         *         needed to compute the projection, otherwise null
+         * NOTE: key data may differ from the stored document data
+         *       (arrays, geo); that has to be handled above this layer
+ */
+ KeyOnly* checkKey( const BSONObj& keyPattern ) const;
+
+ bool includeID() const { return _includeID; }
+
+ private:
+
+ /**
+ * appends e to b if user wants it
+ * will descend into e if needed
+ */
+ void append( BSONObjBuilder& b , const BSONElement& e ) const;
+
+
+ void add( const string& field, bool include );
+ void add( const string& field, int skip, int limit );
+ void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const;
+
+ bool _include; // true if default at this level is to include
+ bool _special; // true if this level can't be skipped or included without recursing
+
+ //TODO: benchmark vector<pair> vs map
+ typedef map<string, boost::shared_ptr<Projection> > FieldMap;
+ FieldMap _fields;
+ BSONObj _source;
+ bool _includeID;
+
+ // used for $slice operator
+ int _skip;
+ int _limit;
+
+ bool _hasNonSimple;
+ };
+
+
+}
diff --git a/src/mongo/db/queryoptimizer.cpp b/src/mongo/db/queryoptimizer.cpp
new file mode 100644
index 00000000000..9d9040d51e2
--- /dev/null
+++ b/src/mongo/db/queryoptimizer.cpp
@@ -0,0 +1,1337 @@
+// @file queryoptimizer.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "cmdline.h"
+#include "clientcursor.h"
+
+//#define DEBUGQO(x) cout << x << endl;
+#define DEBUGQO(x)
+
+namespace mongo {
+
+ void checkTableScanAllowed( const char * ns ) {
+ if ( ! cmdLine.noTableScan )
+ return;
+
+ if ( strstr( ns , ".system." ) ||
+ strstr( ns , "local." ) )
+ return;
+
+ if ( ! nsdetails( ns ) )
+ return;
+
+ uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.noTableScan );
+ }
+
+ double elementDirection( const BSONElement &e ) {
+ if ( e.isNumber() )
+ return e.number();
+ return 1;
+ }
+
+ QueryPlan::QueryPlan(
+ NamespaceDetails *d, int idxNo,
+ const FieldRangeSetPair &frsp, const FieldRangeSetPair *originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONObj &startKey, const BSONObj &endKey , string special ) :
+ _d(d), _idxNo(idxNo),
+ _frs( frsp.frsForIndex( _d, _idxNo ) ),
+ _frsMulti( frsp.frsForIndex( _d, -1 ) ),
+ _originalQuery( originalQuery ),
+ _order( order ),
+ _index( 0 ),
+ _optimal( false ),
+ _scanAndOrderRequired( true ),
+ _exactKeyMatch( false ),
+ _direction( 0 ),
+ _endKeyInclusive( endKey.isEmpty() ),
+ _unhelpful( false ),
+ _impossible( false ),
+ _special( special ),
+ _type(0),
+ _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+
+ BSONObj idxKey = _idxNo < 0 ? BSONObj() : d->idx( _idxNo ).keyPattern();
+
+ if ( !_frs.matchPossibleForIndex( idxKey ) ) {
+ _impossible = true;
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ if ( willScanTable() ) {
+ if ( _order.isEmpty() || !strcmp( _order.firstElementFieldName(), "$natural" ) )
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ _index = &d->idx(_idxNo);
+
+ // If the parsing or index indicates this is a special query, don't continue the processing
+ if ( _special.size() ||
+ ( _index->getSpec().getType() && _index->getSpec().getType()->suitability( originalQuery, order ) != USELESS ) ) {
+
+ if( _special.size() ) _optimal = true;
+
+ _type = _index->getSpec().getType();
+ if( !_special.size() ) _special = _index->getSpec().getType()->getPlugin()->getName();
+
+ massert( 13040 , (string)"no type for special: " + _special , _type );
+ // hopefully safe to use original query in these contexts - don't think we can mix special with $or clause separation yet
+ _scanAndOrderRequired = _type->scanAndOrderRequired( _originalQuery , order );
+ return;
+ }
+
+ const IndexSpec &idxSpec = _index->getSpec();
+ BSONObjIterator o( order );
+ BSONObjIterator k( idxKey );
+ if ( !o.moreWithEOO() )
+ _scanAndOrderRequired = false;
+ while( o.moreWithEOO() ) {
+ BSONElement oe = o.next();
+ if ( oe.eoo() ) {
+ _scanAndOrderRequired = false;
+ break;
+ }
+ if ( !k.moreWithEOO() )
+ break;
+ BSONElement ke;
+ while( 1 ) {
+ ke = k.next();
+ if ( ke.eoo() )
+ goto doneCheckOrder;
+ if ( strcmp( oe.fieldName(), ke.fieldName() ) == 0 )
+ break;
+ if ( !_frs.range( ke.fieldName() ).equality() )
+ goto doneCheckOrder;
+ }
+ int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1;
+ if ( _direction == 0 )
+ _direction = d;
+ else if ( _direction != d )
+ break;
+ }
+doneCheckOrder:
+ if ( _scanAndOrderRequired )
+ _direction = 0;
+ BSONObjIterator i( idxKey );
+ int exactIndexedQueryCount = 0;
+ int optimalIndexedQueryCount = 0;
+ bool stillOptimalIndexedQueryCount = true;
+ set<string> orderFieldsUnindexed;
+ order.getFieldNames( orderFieldsUnindexed );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ const FieldRange &fr = _frs.range( e.fieldName() );
+ if ( stillOptimalIndexedQueryCount ) {
+ if ( fr.nontrivial() )
+ ++optimalIndexedQueryCount;
+ if ( !fr.equality() )
+ stillOptimalIndexedQueryCount = false;
+ }
+ else {
+ if ( fr.nontrivial() )
+ optimalIndexedQueryCount = -1;
+ }
+ if ( fr.equality() ) {
+ BSONElement e = fr.max();
+ if ( !e.isNumber() && !e.mayEncapsulate() && e.type() != RegEx )
+ ++exactIndexedQueryCount;
+ }
+ orderFieldsUnindexed.erase( e.fieldName() );
+ }
+ if ( !_scanAndOrderRequired &&
+ ( optimalIndexedQueryCount == _frs.nNontrivialRanges() ) )
+ _optimal = true;
+ if ( exactIndexedQueryCount == _frs.nNontrivialRanges() &&
+ orderFieldsUnindexed.size() == 0 &&
+ exactIndexedQueryCount == idxKey.nFields() &&
+ exactIndexedQueryCount == _originalQuery.nFields() ) {
+ _exactKeyMatch = true;
+ }
+ _frv.reset( new FieldRangeVector( _frs, idxSpec, _direction ) );
+ if ( originalFrsp ) {
+ _originalFrv.reset( new FieldRangeVector( originalFrsp->frsForIndex( _d, _idxNo ), idxSpec, _direction ) );
+ }
+ else {
+ _originalFrv = _frv;
+ }
+ if ( _startOrEndSpec ) {
+ BSONObj newStart, newEnd;
+ if ( !startKey.isEmpty() )
+ _startKey = startKey;
+ else
+ _startKey = _frv->startKey();
+ if ( !endKey.isEmpty() )
+ _endKey = endKey;
+ else
+ _endKey = _frv->endKey();
+ }
+
+ if ( ( _scanAndOrderRequired || _order.isEmpty() ) &&
+ !_frs.range( idxKey.firstElementFieldName() ).nontrivial() ) {
+ _unhelpful = true;
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const {
+
+ if ( _type ) {
+ // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet
+ return _type->newCursor( _originalQuery , _order , numWanted );
+ }
+
+ if ( _impossible ) {
+ // TODO We might want to allow this dummy table scan even in no table
+ // scan mode, since it won't scan anything.
+ if ( _frs.nNontrivialRanges() )
+ checkTableScanAllowed( _frs.ns() );
+ return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
+ }
+
+ if ( willScanTable() ) {
+ if ( _frs.nNontrivialRanges() ) {
+ checkTableScanAllowed( _frs.ns() );
+
+ // if we are doing a table scan on _id
+ // and it's a capped collection
+ // we warn /*disallow*/ as it's a common user error
+ // .system. and local collections are exempt
+ if ( _d && _d->capped && _frs.range( "_id" ).nontrivial() ) {
+ if ( cc().isSyncThread() ||
+ str::contains( _frs.ns() , ".system." ) ||
+ str::startsWith( _frs.ns() , "local." ) ) {
+ // ok
+ }
+ else {
+                        warning() << "_id query on capped collection without an _id index; performance will be poor. collection: " << _frs.ns() << endl;
+ //uassert( 14820, str::stream() << "doing _id query on a capped collection without an index is not allowed: " << _frs.ns() ,
+ }
+ }
+ }
+ return findTableScan( _frs.ns(), _order, startLoc );
+ }
+
+ massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() );
+
+ if ( _startOrEndSpec ) {
+ // we are sure to spec _endKeyInclusive
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) );
+ }
+ else if ( _index->getSpec().getType() ) {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) );
+ }
+ else {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) );
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newReverseCursor() const {
+ if ( willScanTable() ) {
+ int orderSpec = _order.getIntField( "$natural" );
+ if ( orderSpec == INT_MIN )
+ orderSpec = 1;
+ return findTableScan( _frs.ns(), BSON( "$natural" << -orderSpec ) );
+ }
+ massert( 10364 , "newReverseCursor() not implemented for indexed plans", false );
+ return shared_ptr<Cursor>();
+ }
+
+ BSONObj QueryPlan::indexKey() const {
+ if ( !_index )
+ return BSON( "$natural" << 1 );
+ return _index->keyPattern();
+ }
+
+ void QueryPlan::registerSelf( long long nScanned ) const {
+ // Impossible query constraints can be detected before scanning, and we
+ // don't have a reserved pattern enum value for impossible constraints.
+ if ( _impossible ) {
+ return;
+ }
+
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _frs.pattern( _order ), indexKey(), nScanned );
+ }
+
+ /**
+ * @return a copy of the inheriting class, which will be run with its own
+ * query plan. If multiple plan sets are required for an $or query, the
+ * QueryOp of the winning plan from a given set will be cloned to generate
+ * QueryOps for the subsequent plan set. This function should only be called
+ * after the query op has completed executing.
+ */
+ QueryOp *QueryOp::createChild() {
+ if( _orConstraint.get() ) {
+ _matcher->advanceOrClause( _orConstraint );
+ _orConstraint.reset();
+ }
+ QueryOp *ret = _createChild();
+ ret->_oldMatcher = _matcher;
+ return ret;
+ }
+
+ bool QueryPlan::isMultiKey() const {
+ if ( _idxNo < 0 )
+ return false;
+ return _d->isMultikey( _idxNo );
+ }
+
+ void QueryOp::init() {
+ if ( _oldMatcher.get() ) {
+ _matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) );
+ }
+ else {
+ _matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) );
+ }
+ _init();
+ }
+
+ QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr<FieldRangeSetPair> frsp, auto_ptr<FieldRangeSetPair> originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
+ _ns(ns),
+ _originalQuery( originalQuery ),
+ _frsp( frsp ),
+ _originalFrsp( originalFrsp ),
+ _mayRecordPlan( false ),
+ _usingCachedPlan( false ),
+ _hint( BSONObj() ),
+ _order( order.getOwned() ),
+ _oldNScanned( 0 ),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _min( min.getOwned() ),
+ _max( max.getOwned() ),
+ _bestGuessOnly( bestGuessOnly ),
+ _mayYield( mayYield ),
+ _yieldSometimesTracker( 256, 20 ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+ if ( hint && !hint->eoo() ) {
+ _hint = hint->wrap();
+ }
+ init();
+ }
+
+ bool QueryPlanSet::modifiedKeys() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+ bool QueryPlanSet::hasMultiKey() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+
+ void QueryPlanSet::addHint( IndexDetails &id ) {
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern = id.keyPattern();
+ // This reformats _min and _max to be used for index lookup.
+ massert( 10365 , errmsg, indexDetailsForRange( _frsp->ns(), errmsg, _min, _max, keyPattern ) );
+ }
+ NamespaceDetails *d = nsdetails(_ns);
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ }
+
+ // returns an IndexDetails * for a hint, 0 if hint is $natural.
+ // hint must not be eoo()
+ IndexDetails *parseHint( const BSONElement &hint, NamespaceDetails *d ) {
+ massert( 13292, "hint eoo", !hint.eoo() );
+ if( hint.type() == String ) {
+ string hintstr = hint.valuestr();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( ii.indexName() == hintstr ) {
+ return &ii;
+ }
+ }
+ }
+ else if( hint.type() == Object ) {
+ BSONObj hintobj = hint.embeddedObject();
+ uassert( 10112 , "bad hint", !hintobj.isEmpty() );
+ if ( !strcmp( hintobj.firstElementFieldName(), "$natural" ) ) {
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(hintobj) == 0 ) {
+ return &ii;
+ }
+ }
+ }
+ uassert( 10113 , "bad hint", false );
+ return 0;
+ }
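
The three hint forms parseHint() accepts, sketched with illustrative key patterns (d is the collection's NamespaceDetails*):

    parseHint( BSON( "hint" << "x_1" ).firstElement(), d );                   // by index name
    parseHint( BSON( "hint" << BSON( "x" << 1 ) ).firstElement(), d );        // by key pattern
    parseHint( BSON( "hint" << BSON( "$natural" << 1 ) ).firstElement(), d );
    // returns 0 for $natural, meaning force a table scan;
    // any other unresolvable hint fails with uassert 10113
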
+
+ void QueryPlanSet::init() {
+        DEBUGQO( "QueryPlanSet::init " << _ns << "\t" << _originalQuery );
+ _runner.reset();
+ _plans.clear();
+ _usingCachedPlan = false;
+
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d || !_frsp->matchPossible() ) {
+ // Table scan plan, when no matches are possible
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ BSONElement hint = _hint.firstElement();
+ if ( !hint.eoo() ) {
+ IndexDetails *id = parseHint( hint, d );
+ if ( id ) {
+ addHint( *id );
+ }
+ else {
+ massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() );
+ // Table scan plan
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ }
+ return;
+ }
+
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern;
+ IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern );
+ massert( 10367 , errmsg, idx );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ return;
+ }
+
+ if ( isSimpleIdQuery( _originalQuery ) ) {
+ int idx = d->findIdIndex();
+ if ( idx >= 0 ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_frsp , _originalFrsp.get() , _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+ }
+
+ if ( _originalQuery.isEmpty() && _order.isEmpty() ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ DEBUGQO( "\t special : " << _frsp->getSpecial() );
+ if ( _frsp->getSpecial().size() ) {
+ _special = _frsp->getSpecial();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ const IndexSpec& spec = ii.getSpec();
+ if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_frsp , _originalFrsp.get() , _originalQuery, _order ,
+ _mustAssertOnYieldFailure , BSONObj() , BSONObj() , _special ) ) );
+ return;
+ }
+ }
+ uassert( 13038 , (string)"can't find special index: " + _special + " for: " + _originalQuery.toString() , 0 );
+ }
+
+ if ( _honorRecordedPlan ) {
+ pair< BSONObj, long long > best = QueryUtilIndexed::bestIndexForPatterns( *_frsp, _order );
+ BSONObj bestIndex = best.first;
+ long long oldNScanned = best.second;
+ if ( !bestIndex.isEmpty() ) {
+ QueryPlanPtr p;
+ _oldNScanned = oldNScanned;
+ if ( !strcmp( bestIndex.firstElementFieldName(), "$natural" ) ) {
+ // Table scan plan
+ p.reset( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(bestIndex) == 0 ) {
+ p.reset( new QueryPlan( d, j, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+ }
+
+ massert( 10368 , "Unable to locate previously recorded index", p.get() );
+ if ( !( _bestGuessOnly && p->scanAndOrderRequired() ) ) {
+ _usingCachedPlan = true;
+ _plans.push_back( p );
+ return;
+ }
+ }
+ }
+
+ addOtherPlans( false );
+ }
+
+ void QueryPlanSet::addOtherPlans( bool checkFirst ) {
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d )
+ return;
+
+ // If table scan is optimal or natural order requested or tailable cursor requested
+ if ( !_frsp->matchPossible() || ( _frsp->noNontrivialRanges() && _order.isEmpty() ) ||
+ ( !_order.isEmpty() && !strcmp( _order.firstElementFieldName(), "$natural" ) ) ) {
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ return;
+ }
+
+ bool normalQuery = _hint.isEmpty() && _min.isEmpty() && _max.isEmpty();
+
+ PlanSet plans;
+ QueryPlanPtr optimalPlan;
+ QueryPlanPtr specialPlan;
+ for( int i = 0; i < d->nIndexes; ++i ) {
+ if ( normalQuery ) {
+ BSONObj keyPattern = d->idx( i ).keyPattern();
+ if ( !_frsp->matchPossibleForIndex( d, i, keyPattern ) ) {
+                    // If no match is possible, only generate a trivial plan that won't
+ // scan any documents.
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ addPlan( p, checkFirst );
+ return;
+ }
+ if ( !QueryUtilIndexed::indexUseful( *_frsp, d, i, _order ) ) {
+ continue;
+ }
+ }
+
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ if ( p->optimal() ) {
+ if ( !optimalPlan.get() ) {
+ optimalPlan = p;
+ }
+ }
+ else if ( !p->unhelpful() ) {
+ if ( p->special().empty() ) {
+ plans.push_back( p );
+ }
+ else {
+ specialPlan = p;
+ }
+ }
+ }
+ if ( optimalPlan.get() ) {
+ addPlan( optimalPlan, checkFirst );
+ return;
+ }
+ for( PlanSet::const_iterator i = plans.begin(); i != plans.end(); ++i ) {
+ addPlan( *i, checkFirst );
+ }
+
+ // Only add a special plan if no standard btree plans have been added. SERVER-4531
+ if ( plans.empty() && specialPlan ) {
+ addPlan( specialPlan, checkFirst );
+ return;
+ }
+
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ _mayRecordPlan = true;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::runOp( QueryOp &op ) {
+ if ( _usingCachedPlan ) {
+ Runner r( *this, op );
+ shared_ptr<QueryOp> res = r.runUntilFirstCompletes();
+ // _plans.size() > 1 if addOtherPlans was called in Runner::runUntilFirstCompletes().
+ if ( _bestGuessOnly || res->complete() || _plans.size() > 1 )
+ return res;
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+            // Careful here, as the namespace may have been dropped.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ }
+ Runner r( *this, op );
+ return r.runUntilFirstCompletes();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::nextOp( QueryOp &originalOp, bool retried ) {
+ if ( !_runner ) {
+ _runner.reset( new Runner( *this, originalOp ) );
+ shared_ptr<QueryOp> op = _runner->init();
+ if ( op->complete() ) {
+ return op;
+ }
+ }
+ shared_ptr<QueryOp> op = _runner->nextNonError();
+ if ( !op->error() ) {
+ return op;
+ }
+ if ( !_usingCachedPlan || _bestGuessOnly || _plans.size() > 1 ) {
+ return op;
+ }
+
+ // Avoid an infinite loop here - this should never occur.
+ verify( 15878, !retried );
+
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ return nextOp( originalOp, true );
+ }
+
+ bool QueryPlanSet::prepareToYield() {
+ return _runner ? _runner->prepareToYield() : true;
+ }
+
+ void QueryPlanSet::recoverFromYield() {
+ if ( _runner ) {
+ _runner->recoverFromYield();
+ }
+ }
+
+ void QueryPlanSet::clearRunner() {
+ if ( _runner ) {
+ _runner.reset();
+ }
+ }
+
+ BSONObj QueryPlanSet::explain() const {
+ vector<BSONObj> arr;
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) {
+ shared_ptr<Cursor> c = (*i)->newCursor();
+ BSONObjBuilder explain;
+ explain.append( "cursor", c->toString() );
+ explain.append( "indexBounds", c->prettyIndexBounds() );
+ arr.push_back( explain.obj() );
+ }
+ BSONObjBuilder b;
+ b.append( "allPlans", arr );
+ return b.obj();
+ }
+
+ QueryPlanSet::QueryPlanPtr QueryPlanSet::getBestGuess() const {
+ assert( _plans.size() );
+ if ( _plans[ 0 ]->scanAndOrderRequired() ) {
+ for ( unsigned i=1; i<_plans.size(); i++ ) {
+ if ( ! _plans[i]->scanAndOrderRequired() )
+ return _plans[i];
+ }
+
+                warning() << "best guess query plan requested, but scan and order are required for all plans"
+                          << " query: " << _originalQuery
+ << " order: " << _order
+ << " choices: ";
+
+ for ( unsigned i=0; i<_plans.size(); i++ )
+ warning() << _plans[i]->indexKey() << " ";
+ warning() << endl;
+
+ return QueryPlanPtr();
+ }
+ return _plans[0];
+ }
+
+ QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) :
+ _op( op ),
+ _plans( plans ) {
+ }
+
+ bool QueryPlanSet::Runner::prepareToYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !prepareToYieldOp( **i ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void QueryPlanSet::Runner::recoverFromYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ recoverFromYieldOp( **i );
+ }
+ }
+
+ void QueryPlanSet::Runner::mayYield() {
+ if ( ! _plans._mayYield )
+ return;
+
+ if ( ! _plans._yieldSometimesTracker.intervalHasElapsed() )
+ return;
+
+ int micros = ClientCursor::suggestYieldMicros();
+ if ( micros <= 0 )
+ return;
+
+ if ( !prepareToYield() )
+ return;
+
+ ClientCursor::staticYield( micros , _plans._ns , 0 );
+ recoverFromYield();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::init() {
+ massert( 10369 , "no plans", _plans._plans.size() > 0 );
+
+ if ( _plans._bestGuessOnly ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ shared_ptr<QueryPlan> plan = _plans.getBestGuess();
+ massert( 15894, "no index matches QueryPlanSet's sort with _bestGuessOnly", plan.get() );
+ op->setQueryPlan( plan.get() );
+ _ops.push_back( op );
+ }
+ else {
+ if ( _plans._plans.size() > 1 )
+ log(1) << " running multiple plans" << endl;
+ for( PlanSet::iterator i = _plans._plans.begin(); i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ }
+ }
+
+ // Initialize ops.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ initOp( **i );
+ if ( (*i)->complete() )
+ return *i;
+ }
+
+ // Put runnable ops in the priority queue.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !(*i)->error() ) {
+ _queue.push( *i );
+ }
+ }
+
+ return *_ops.begin();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::nextNonError() {
+ if ( _queue.empty() ) {
+ return *_ops.begin();
+ }
+ shared_ptr<QueryOp> ret;
+ do {
+ ret = next();
+ } while( ret->error() && !_queue.empty() );
+ return ret;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::next() {
+ mayYield();
+ dassert( !_queue.empty() );
+ OpHolder holder = _queue.pop();
+ QueryOp &op = *holder._op;
+ nextOp( op );
+ if ( op.complete() ) {
+ if ( _plans._mayRecordPlan && op.mayRecordPlan() ) {
+ op.qp().registerSelf( op.nscanned() );
+ }
+ return holder._op;
+ }
+ if ( op.error() ) {
+ return holder._op;
+ }
+ if ( !_plans._bestGuessOnly && _plans._usingCachedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) {
+ holder._offset = -op.nscanned();
+ _plans.addOtherPlans( /* avoid duplicating the initial plan */ true );
+ PlanSet::iterator i = _plans._plans.begin();
+ ++i;
+ for( ; i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ initOp( *op );
+ if ( op->complete() )
+ return op;
+ _queue.push( op );
+ }
+ _plans._usingCachedPlan = false;
+ }
+ _queue.push( holder );
+ return holder._op;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::runUntilFirstCompletes() {
+ shared_ptr<QueryOp> potentialFinisher = init();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+
+ while( !_queue.empty() ) {
+ shared_ptr<QueryOp> potentialFinisher = next();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+ }
+ return _ops[ 0 ];
+ }
+
+#define GUARD_OP_EXCEPTION( op, expression ) \
+ try { \
+ expression; \
+ } \
+ catch ( DBException& e ) { \
+ op.setException( e.getInfo() ); \
+ } \
+ catch ( const std::exception &e ) { \
+ op.setException( ExceptionInfo( e.what() , 0 ) ); \
+ } \
+ catch ( ... ) { \
+ op.setException( ExceptionInfo( "Caught unknown exception" , 0 ) ); \
+ }
+
+
+ void QueryPlanSet::Runner::initOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, op.init() );
+ }
+
+ void QueryPlanSet::Runner::nextOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.next(); } );
+ }
+
+ bool QueryPlanSet::Runner::prepareToYieldOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op,
+ if ( op.error() ) {
+ return true;
+ }
+ else {
+ return op.prepareToYield();
+ } );
+ return true;
+ }
+
+ void QueryPlanSet::Runner::recoverFromYieldOp( QueryOp &op ) {
+ GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.recoverFromYield(); } );
+ }
+
+ /**
+ * NOTE on our $or implementation: In our current qo implementation we don't
+ * keep statistics on our data, but we can conceptualize the problem of
+ * selecting an index when statistics exist for all index ranges. The
+ * d-hitting set problem on k sets and n elements can be reduced to the
+ * problem of index selection on k $or clauses and n index ranges (where
+ * d is the max number of indexes, and the number of ranges n is unbounded).
+     * In light of the fact that d-hitting set is NP-complete, and we don't even
+ * track statistics (so cost calculations are expensive) our first
+ * implementation uses the following greedy approach: We take one $or clause
+ * at a time and treat each as a separate query for index selection purposes.
+ * But if an index range is scanned for a particular $or clause, we eliminate
+ * that range from all subsequent clauses. One could imagine an opposite
+ * implementation where we select indexes based on the union of index ranges
+ * for all $or clauses, but this can have much poorer worst case behavior.
+ * (An index range that suits one $or clause may not suit another, and this
+ * is worse than the typical case of index range choice staleness because
+ * with $or the clauses may likely be logically distinct.) The greedy
+ * implementation won't do any worse than all the $or clauses individually,
+ * and it can often do better. In the first cut we are intentionally using
+ * QueryPattern tracking to record successful plans on $or clauses for use by
+ * subsequent $or clauses, even though there may be a significant aggregate
+ * $nor component that would not be represented in QueryPattern.
+ */
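+
+    /*
+     * A concrete sketch of the greedy approach described above, using a
+     * hypothetical query and a single index on { a : 1 }:
+     *
+     *   { $or : [ { a : { $lt : 5 } }, { a : { $lt : 10 } } ] }
+     *
+     * The first clause is planned in isolation and scans the index range
+     * a in (-inf, 5). Before the second clause is planned, that range is
+     * eliminated from its constraints, so the second clause scans only
+     * a in [5, 10) rather than the full range a in (-inf, 10), and no
+     * document is visited by both clauses.
+     */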
+
+ MultiPlanScanner::MultiPlanScanner( const char *ns,
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint,
+ bool honorRecordedPlan,
+ const BSONObj &min,
+ const BSONObj &max,
+ bool bestGuessOnly,
+ bool mayYield ) :
+ _ns( ns ),
+ _or( !query.getField( "$or" ).eoo() ),
+ _query( query.getOwned() ),
+ _i(),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _bestGuessOnly( bestGuessOnly ),
+ _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ),
+ _mayYield( mayYield ),
+ _tableScanned() {
+ if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() ) {
+ _or = false;
+ }
+ if ( _or ) {
+ // Only construct an OrRangeGenerator if we may handle $or clauses.
+ _org.reset( new OrRangeGenerator( ns, _query ) );
+ if ( !_org->getSpecial().empty() ) {
+ _or = false;
+ }
+ else if ( uselessOr( _hint.firstElement() ) ) {
+ _or = false;
+ }
+ }
+ // if _or == false, don't use or clauses for index selection
+ if ( !_or ) {
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, _query, true ) );
+ _currentQps.reset( new QueryPlanSet( ns, frsp, auto_ptr<FieldRangeSetPair>(), _query, order, false, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) );
+ }
+ else {
+ BSONElement e = _query.getField( "$or" );
+ massert( 13268, "invalid $or spec", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ }
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::runOpOnce( QueryOp &op ) {
+ assertMayRunMore();
+ if ( !_or ) {
+ ++_i;
+ return _currentQps->runOp( op );
+ }
+ ++_i;
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
+ BSONElement hintElt = _hint.firstElement();
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ shared_ptr<QueryOp> ret( _currentQps->runOp( op ) );
+ if ( ! ret->complete() )
+ throw MsgAssertionException( ret->exception() );
+ if ( ret->qp().willScanTable() ) {
+ _tableScanned = true;
+ } else {
+            // The full table was not scanned, so pop the or clause handled by this plan.
+ _org->popOrClause( ret->qp().nsd(), ret->qp().idxNo(), ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() );
+ }
+ return ret;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::runOp( QueryOp &op ) {
+ shared_ptr<QueryOp> ret = runOpOnce( op );
+ while( !ret->stopRequested() && mayRunMore() ) {
+ ret = runOpOnce( *ret );
+ }
+ return ret;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpHandleEndOfClause() {
+ shared_ptr<QueryOp> op = _currentQps->nextOp( *_baseOp );
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( op->qp().willScanTable() ) {
+ _tableScanned = true;
+ } else {
+ _org->popOrClause( op->qp().nsd(), op->qp().idxNo(), op->qp().indexed() ? op->qp().indexKey() : BSONObj() );
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOpBeginningClause() {
+ assertMayRunMore();
+ shared_ptr<QueryOp> op;
+ while( mayRunMore() ) {
+ ++_i;
+ auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() );
+ auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() );
+ BSONElement hintElt = _hint.firstElement();
+ _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) );
+ op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ _baseOp = op;
+ }
+ return op;
+ }
+
+ shared_ptr<QueryOp> MultiPlanScanner::nextOp() {
+ if ( !_or ) {
+ if ( _i == 0 ) {
+ assertMayRunMore();
+ ++_i;
+ }
+ return _currentQps->nextOp( *_baseOp );
+ }
+ if ( _i == 0 ) {
+ return nextOpBeginningClause();
+ }
+ shared_ptr<QueryOp> op = nextOpHandleEndOfClause();
+ if ( !op->complete() ) {
+ return op;
+ }
+ if ( !op->stopRequested() && mayRunMore() ) {
+ // Finished scanning the clause, but stop hasn't been requested.
+ // Start scanning the next clause.
+ _baseOp = op;
+ return nextOpBeginningClause();
+ }
+ return op;
+ }
+
+ bool MultiPlanScanner::prepareToYield() {
+ return _currentQps.get() ? _currentQps->prepareToYield() : true;
+ }
+
+ void MultiPlanScanner::recoverFromYield() {
+ if ( _currentQps.get() ) {
+ _currentQps->recoverFromYield();
+ }
+ }
+
+ void MultiPlanScanner::clearRunner() {
+ if ( _currentQps.get() ) {
+ _currentQps->clearRunner();
+ }
+ }
+
+ int MultiPlanScanner::currentNPlans() const {
+ return _currentQps.get() ? _currentQps->nPlans() : 0;
+ }
+
+ shared_ptr<Cursor> MultiPlanScanner::singleCursor() const {
+ const QueryPlan *qp = singlePlan();
+ if ( !qp ) {
+ return shared_ptr<Cursor>();
+ }
+ // If there is only one plan and it does not require an in memory
+ // sort, we do not expect its cursor op to throw an exception and
+ // so do not need a QueryOptimizerCursor to handle this case.
+ return qp->newCursor();
+ }
+
+ const QueryPlan *MultiPlanScanner::singlePlan() const {
+ if ( _or || _currentQps->nPlans() != 1 || _currentQps->firstPlan()->scanAndOrderRequired() || _currentQps->usingCachedPlan() ) {
+ return 0;
+ }
+ return _currentQps->firstPlan().get();
+ }
+
+ bool MultiPlanScanner::uselessOr( const BSONElement &hint ) const {
+ NamespaceDetails *nsd = nsdetails( _ns );
+ if ( !nsd ) {
+ return true;
+ }
+ if ( !hint.eoo() ) {
+ IndexDetails *id = parseHint( hint, nsd );
+ if ( !id ) {
+ return true;
+ }
+ return QueryUtilIndexed::uselessOr( *_org, nsd, nsd->idxNo( *id ) );
+ }
+ return QueryUtilIndexed::uselessOr( *_org, nsd, -1 );
+ }
+
+ MultiCursor::MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op, bool mayYield )
+ : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() {
+ if ( op.get() ) {
+ _op = op;
+ }
+ else {
+ _op.reset( new NoOp() );
+ }
+ if ( _mps->mayRunMore() ) {
+ nextClause();
+ if ( !ok() ) {
+ advance();
+ }
+ }
+ else {
+ _c.reset( new BasicCursor( DiskLoc() ) );
+ }
+ }
+
+ MultiCursor::MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned )
+ : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( nscanned ) {
+ _mps->setBestGuessOnly();
+ _mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet
+ if ( !ok() ) {
+ // would have been advanced by UserQueryOp if possible
+ advance();
+ }
+ }
+
+ void MultiCursor::nextClause() {
+ if ( _nscanned >= 0 && _c.get() ) {
+ _nscanned += _c->nscanned();
+ }
+ shared_ptr<CursorOp> best = _mps->runOpOnce( *_op );
+ if ( ! best->complete() )
+ throw MsgAssertionException( best->exception() );
+ _c = best->newCursor();
+ _matcher = best->matcher( _c );
+ _op = best;
+ }
+
+ bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) {
+ BSONObjIterator p( idxPattern );
+ BSONObjIterator k( sampleKey );
+ int i = 0;
+ while( 1 ) {
+ BSONElement pe = p.next();
+ BSONElement ke = k.next();
+ if ( pe.eoo() && ke.eoo() )
+ return true;
+ if ( pe.eoo() || ke.eoo() )
+ return false;
+ if ( strcmp( pe.fieldName(), ke.fieldName() ) != 0 )
+ return false;
+ if ( ( i == firstSignificantField ) && !( ( direction > 0 ) == ( pe.number() > 0 ) ) )
+ return false;
+ ++i;
+ }
+ return false;
+ }
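+
+    /*
+     * For illustration, with a hypothetical index pattern and sample key:
+     * indexWorks( BSON( "a" << 1 << "b" << -1 ), fromjson( "{a:1,b:1}" ), 1, 0 )
+     * returns true, since the field names line up and the first significant
+     * field (a) ascends, matching the requested direction; passing direction
+     * -1 instead would return false.
+     */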
+
+ BSONObj extremeKeyForIndex( const BSONObj &idxPattern, int baseDirection ) {
+ BSONObjIterator i( idxPattern );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ int idxDirection = e.number() >= 0 ? 1 : -1;
+ int direction = idxDirection * baseDirection;
+ switch( direction ) {
+ case 1:
+ b.appendMaxKey( e.fieldName() );
+ break;
+ case -1:
+ b.appendMinKey( e.fieldName() );
+ break;
+ default:
+ assert( false );
+ }
+ }
+ return b.obj();
+ }
+
+ pair<int,int> keyAudit( const BSONObj &min, const BSONObj &max ) {
+ int direction = 0;
+ int firstSignificantField = 0;
+ BSONObjIterator i( min );
+ BSONObjIterator a( max );
+ while( 1 ) {
+ BSONElement ie = i.next();
+ BSONElement ae = a.next();
+ if ( ie.eoo() && ae.eoo() )
+ break;
+ if ( ie.eoo() || ae.eoo() || strcmp( ie.fieldName(), ae.fieldName() ) != 0 ) {
+ return make_pair( -1, -1 );
+ }
+ int cmp = ie.woCompare( ae );
+ if ( cmp < 0 )
+ direction = 1;
+ if ( cmp > 0 )
+ direction = -1;
+ if ( direction != 0 )
+ break;
+ ++firstSignificantField;
+ }
+ return make_pair( direction, firstSignificantField );
+ }
+
+ pair<int,int> flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
+ if ( min.isEmpty() || max.isEmpty() ) {
+ return make_pair( 1, -1 );
+ }
+ else {
+ return keyAudit( min, max );
+ }
+ }
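+
+    /*
+     * A worked example of the audit, with hypothetical keys: for
+     * min { a:1, b:5 } and max { a:1, b:9 }, the fields are compared in
+     * order; 'a' ties and 'b' compares less, so keyAudit() returns
+     * ( direction 1, firstSignificantField 1 ). flexibleKeyAudit() simply
+     * treats a missing min or max as an ascending scan with no significant
+     * field, i.e. ( 1, -1 ).
+     */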
+
+ // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+ if ( min.isEmpty() && max.isEmpty() ) {
+ errmsg = "one of min or max must be specified";
+ return 0;
+ }
+
+ Client::Context ctx( ns );
+ IndexDetails *id = 0;
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d ) {
+ errmsg = "ns not found";
+ return 0;
+ }
+
+ pair<int,int> ret = flexibleKeyAudit( min, max );
+ if ( ret == make_pair( -1, -1 ) ) {
+ errmsg = "min and max keys do not share pattern";
+ return 0;
+ }
+ if ( keyPattern.isEmpty() ) {
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ if ( ii.getSpec().getType() == 0 ) {
+ id = &ii;
+ keyPattern = ii.keyPattern();
+ break;
+ }
+ }
+ }
+
+ }
+ else {
+ if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+ errmsg = "requested keyPattern does not match specified keys";
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(keyPattern) == 0 ) {
+ id = &ii;
+ break;
+ }
+ if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 &&
+ IndexDetails::isIdIndexPattern( keyPattern ) &&
+ ii.isIdIndex() ) {
+ id = &ii;
+ break;
+ }
+
+ }
+ }
+
+ if ( min.isEmpty() ) {
+ min = extremeKeyForIndex( keyPattern, -1 );
+ }
+ else if ( max.isEmpty() ) {
+ max = extremeKeyForIndex( keyPattern, 1 );
+ }
+
+ if ( !id ) {
+ errmsg = str::stream() << "no index found for specified keyPattern: " << keyPattern.toString()
+ << " min: " << min << " max: " << max;
+ return 0;
+ }
+
+ min = min.extractFieldsUnDotted( keyPattern );
+ max = max.extractFieldsUnDotted( keyPattern );
+
+ return id;
+ }
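+
+    /*
+     * Usage sketch, with a hypothetical namespace and keys: given
+     * min { a:1 } and an empty max, indexDetailsForRange( "test.foo", errmsg,
+     * min, max, keyPattern ) selects a compatible index, fills in max with the
+     * index's extreme key ( { a:MaxKey } here ), and rewrites min and max to
+     * match the index's key layout. A null return with errmsg populated means
+     * no compatible index was found.
+     */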
+
+ bool isSimpleIdQuery( const BSONObj& query ) {
+ BSONObjIterator i(query);
+
+ if( !i.more() )
+ return false;
+
+ BSONElement e = i.next();
+
+ if( i.more() )
+ return false;
+
+ if( strcmp("_id", e.fieldName()) != 0 )
+ return false;
+
+ if ( e.isSimpleType() ) // e.g. not something like { _id : { $gt : ...
+ return true;
+
+ if ( e.type() == Object )
+ return e.Obj().firstElementFieldName()[0] != '$';
+
+ return false;
+ }
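+
+    /*
+     * For illustration:
+     *   { _id : 5 }             -> true  (single field, simple type)
+     *   { _id : { a : 1 } }     -> true  (object value, first field not a $ operator)
+     *   { _id : { $gt : 5 } }   -> false (operator expression)
+     *   { _id : 5, x : 1 }      -> false (more than one field)
+     */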
+
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) {
+ if( !query.getField( "$or" ).eoo() ) {
+ return shared_ptr<Cursor>( new MultiCursor( ns, query, sort ) );
+ }
+ else {
+ auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, query, true ) );
+ auto_ptr<FieldRangeSetPair> origFrsp( new FieldRangeSetPair( *frsp ) );
+
+ QueryPlanSet qps( ns, frsp, origFrsp, query, sort, false );
+ QueryPlanSet::QueryPlanPtr qpp = qps.getBestGuess();
+ if( ! qpp.get() ) return shared_ptr<Cursor>();
+
+ shared_ptr<Cursor> ret = qpp->newCursor();
+
+ // If we don't already have a matcher, supply one.
+ if ( !query.isEmpty() && ! ret->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) );
+ ret->setMatcher( matcher );
+ }
+ return ret;
+ }
+ }
+
+ bool QueryUtilIndexed::indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order ) {
+ DEV frsp.assertValidIndex( d, idxNo );
+ BSONObj keyPattern = d->idx( idxNo ).keyPattern();
+ if ( !frsp.matchPossibleForIndex( d, idxNo, keyPattern ) ) {
+            // No matches are possible using this index, so the index is useful for proving an empty result set.
+ return true;
+ }
+ return d->idx( idxNo ).getSpec().suitability( frsp.simplifiedQueryForIndex( d, idxNo, keyPattern ), order ) != USELESS;
+ }
+
+ void QueryUtilIndexed::clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ nsd.registerIndexForPattern( frsp._singleKey.pattern( order ), BSONObj(), 0 );
+ nsd.registerIndexForPattern( frsp._multiKey.pattern( order ), BSONObj(), 0 );
+ }
+
+ pair< BSONObj, long long > QueryUtilIndexed::bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) {
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() );
+ // TODO Maybe it would make sense to return the index with the lowest
+ // nscanned if there are two possibilities.
+ if ( frsp._singleKey.matchPossible() ) {
+ QueryPattern pattern = frsp._singleKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ if ( frsp._multiKey.matchPossible() ) {
+ QueryPattern pattern = frsp._multiKey.pattern( order );
+ BSONObj oldIdx = nsd.indexForPattern( pattern );
+ if ( !oldIdx.isEmpty() ) {
+ long long oldNScanned = nsd.nScannedForPattern( pattern );
+ return make_pair( oldIdx, oldNScanned );
+ }
+ }
+ return make_pair( BSONObj(), 0 );
+ }
+
+ bool QueryUtilIndexed::uselessOr( const OrRangeGenerator &org, NamespaceDetails *d, int hintIdx ) {
+ for( list<FieldRangeSetPair>::const_iterator i = org._originalOrSets.begin(); i != org._originalOrSets.end(); ++i ) {
+ if ( hintIdx != -1 ) {
+ if ( !indexUseful( *i, d, hintIdx, BSONObj() ) ) {
+ return true;
+ }
+ }
+ else {
+ bool useful = false;
+ for( int j = 0; j < d->nIndexes; ++j ) {
+ if ( indexUseful( *i, d, j, BSONObj() ) ) {
+ useful = true;
+ break;
+ }
+ }
+ if ( !useful ) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizer.h b/src/mongo/db/queryoptimizer.h
new file mode 100644
index 00000000000..297c6fe9505
--- /dev/null
+++ b/src/mongo/db/queryoptimizer.h
@@ -0,0 +1,599 @@
+// @file queryoptimizer.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "cursor.h"
+#include "jsobj.h"
+#include "queryutil.h"
+#include "matcher.h"
+#include "../util/net/listen.h"
+#include <queue>
+
+namespace mongo {
+
+ class IndexDetails;
+ class IndexType;
+ class ElapsedTracker;
+
+ /** A plan for executing a query using the given index spec and FieldRangeSet. */
+ class QueryPlan : boost::noncopyable {
+ public:
+
+ /**
+ * @param originalFrsp - original constraints for this query clause. If null, frsp will be used instead.
+ */
+ QueryPlan(NamespaceDetails *d,
+ int idxNo, // -1 = no index
+ const FieldRangeSetPair &frsp,
+ const FieldRangeSetPair *originalFrsp,
+ const BSONObj &originalQuery,
+ const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
+ const BSONObj &startKey = BSONObj(),
+ const BSONObj &endKey = BSONObj(),
+ string special="" );
+
+ /** @return true iff no other plans should be considered. */
+ bool optimal() const { return _optimal; }
+        /** @return true iff this plan should not be considered at all. */
+ bool unhelpful() const { return _unhelpful; }
+ /** @return true iff ScanAndOrder processing will be required for result set. */
+ bool scanAndOrderRequired() const { return _scanAndOrderRequired; }
+ /**
+ * @return true iff the index we are using has keys such that it can completely resolve the
+ * query expression to match by itself without ever checking the main object.
+ */
+ bool exactKeyMatch() const { return _exactKeyMatch; }
+ /** @return true iff this QueryPlan would perform an unindexed scan. */
+ bool willScanTable() const { return _idxNo < 0 && !_impossible; }
+ /** @return 'special' attribute of the plan, which was either set explicitly or generated from the index. */
+ const string &special() const { return _special; }
+
+ /** @return a new cursor based on this QueryPlan's index and FieldRangeSet. */
+ shared_ptr<Cursor> newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const;
+ /** @return a new reverse cursor if this is an unindexed plan. */
+ shared_ptr<Cursor> newReverseCursor() const;
+ /** Register this plan as a winner for its QueryPattern, with specified 'nscanned'. */
+ void registerSelf( long long nScanned ) const;
+
+ int direction() const { return _direction; }
+ BSONObj indexKey() const;
+ bool indexed() const { return _index; }
+ int idxNo() const { return _idxNo; }
+ const char *ns() const { return _frs.ns(); }
+ NamespaceDetails *nsd() const { return _d; }
+ BSONObj originalQuery() const { return _originalQuery; }
+ BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _frs.simplifiedQuery( fields ); }
+ const FieldRange &range( const char *fieldName ) const { return _frs.range( fieldName ); }
+ shared_ptr<FieldRangeVector> originalFrv() const { return _originalFrv; }
+
+ const FieldRangeSet &multikeyFrs() const { return _frsMulti; }
+
+ bool mustAssertOnYieldFailure() const { return _mustAssertOnYieldFailure; }
+
+ /** The following member functions are just for testing. */
+
+ shared_ptr<FieldRangeVector> frv() const { return _frv; }
+ bool isMultiKey() const;
+
+ private:
+ NamespaceDetails * _d;
+ int _idxNo;
+ const FieldRangeSet &_frs;
+ const FieldRangeSet &_frsMulti;
+ const BSONObj &_originalQuery;
+ const BSONObj &_order;
+ const IndexDetails * _index;
+ bool _optimal;
+ bool _scanAndOrderRequired;
+ bool _exactKeyMatch;
+ int _direction;
+ shared_ptr<FieldRangeVector> _frv;
+ shared_ptr<FieldRangeVector> _originalFrv;
+ BSONObj _startKey;
+ BSONObj _endKey;
+ bool _endKeyInclusive;
+ bool _unhelpful;
+ bool _impossible;
+ string _special;
+ IndexType * _type;
+ bool _startOrEndSpec;
+ bool _mustAssertOnYieldFailure;
+ };
+
+ /**
+ * Inherit from this interface to implement a new query operation.
+ * The query optimizer will clone the QueryOp that is provided, giving
+ * each clone its own query plan.
+ *
+ * Normal sequence of events:
+ * 1) A new QueryOp is generated using createChild().
+ * 2) A QueryPlan is assigned to this QueryOp with setQueryPlan().
+ * 3) _init() is called on the QueryPlan.
+ * 4) next() is called repeatedly, with nscanned() checked after each call.
+ * 5) In one of these calls to next(), setComplete() is called.
+ * 6) The QueryPattern for the QueryPlan may be recorded as a winner.
+ */
+ class QueryOp {
+ public:
+ QueryOp() : _complete(), _stopRequested(), _qp(), _error() {}
+
+ /** Used when handing off from one QueryOp to another. */
+ QueryOp( const QueryOp &other ) :
+ _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ),
+ _orConstraint( other._orConstraint ) {}
+
+ virtual ~QueryOp() {}
+
+ /** @return QueryPlan assigned to this QueryOp by the query optimizer. */
+ const QueryPlan &qp() const { return *_qp; }
+
+ /** Advance to next potential matching document (eg using a cursor). */
+ virtual void next() = 0;
+ /**
+ * @return current 'nscanned' metric for this QueryOp. Used to compare
+ * cost to other QueryOps.
+ */
+ virtual long long nscanned() = 0;
+ /** Take any steps necessary before the db mutex is yielded. */
+ virtual bool prepareToYield() { massert( 13335, "yield not supported", false ); return false; }
+ /** Recover once the db mutex is regained. */
+ virtual void recoverFromYield() { massert( 13336, "yield not supported", false ); }
+
+ /**
+ * @return true iff the QueryPlan for this QueryOp may be registered
+ * as a winning plan.
+ */
+ virtual bool mayRecordPlan() const = 0;
+
+ /** @return true iff the implementation called setComplete() or setStop(). */
+ bool complete() const { return _complete; }
+        /** @return true iff the implementation called setStop(). */
+ bool stopRequested() const { return _stopRequested; }
+ /** @return true iff the implementation threw an exception. */
+ bool error() const { return _error; }
+ /** @return the exception thrown by implementation if one was thrown. */
+ ExceptionInfo exception() const { return _exception; }
+
+ /** To be called by QueryPlanSet::Runner only. */
+
+ QueryOp *createChild();
+ void setQueryPlan( const QueryPlan *qp ) { _qp = qp; assert( _qp != NULL ); }
+ void init();
+ void setException( const DBException &e ) {
+ _error = true;
+ _exception = e.getInfo();
+ }
+
+ shared_ptr<CoveredIndexMatcher> matcher( const shared_ptr<Cursor>& c ) const {
+ return matcher( c.get() );
+ }
+ shared_ptr<CoveredIndexMatcher> matcher( Cursor* c ) const {
+ if( ! c ) return _matcher;
+ return c->matcher() ? c->matcherPtr() : _matcher;
+ }
+
+ protected:
+ /** Call if all results have been found. */
+ void setComplete() {
+ _orConstraint = qp().originalFrv();
+ _complete = true;
+ }
+ /** Call if the scan is complete even if not all results have been found. */
+ void setStop() { setComplete(); _stopRequested = true; }
+
+ /** Handle initialization after a QueryPlan has been set. */
+ virtual void _init() = 0;
+
+ /** @return a copy of the inheriting class, which will be run with its own query plan. */
+ virtual QueryOp *_createChild() const = 0;
+
+ virtual bool alwaysUseRecord() const { return false; }
+
+ private:
+ bool _complete;
+ bool _stopRequested;
+ ExceptionInfo _exception;
+ const QueryPlan *_qp;
+ bool _error;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+ shared_ptr<CoveredIndexMatcher> _oldMatcher;
+ shared_ptr<FieldRangeVector> _orConstraint;
+ };
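+
+    /*
+     * A minimal sketch of a QueryOp subclass, for illustration only; the
+     * CountingOp name and its trivial counting logic are hypothetical:
+     *
+     *   class CountingOp : public QueryOp {
+     *   public:
+     *       CountingOp() : _n() {}
+     *       virtual void next() {
+     *           if ( !_c->ok() ) { setComplete(); return; }
+     *           ++_n;
+     *           _c->advance();
+     *       }
+     *       virtual long long nscanned() { return _c ? _c->nscanned() : 0; }
+     *       virtual bool mayRecordPlan() const { return true; }
+     *   protected:
+     *       virtual void _init() { _c = qp().newCursor(); }
+     *       virtual QueryOp *_createChild() const { return new CountingOp(); }
+     *   private:
+     *       shared_ptr<Cursor> _c;
+     *       long long _n;
+     *   };
+     */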
+
+    // Temporary implementation. Unlike a regular stl priority queue, this class works even if
+    // T::operator< varies over time. It is very slow, but if v.size() is always very small it is
+    // fine, and maybe even faster than a smarter implementation that does more memory allocations.
+ template<class T>
+ class our_priority_queue : boost::noncopyable {
+ vector<T> v;
+ public:
+ our_priority_queue() {
+ v.reserve(4);
+ }
+ int size() const { return v.size(); }
+ bool empty() const { return v.empty(); }
+ void push(const T & x) {
+ v.push_back(x);
+ }
+ T pop() {
+ size_t t = 0;
+ for( size_t i = 1; i < v.size(); i++ ) {
+ if( v[t] < v[i] )
+ t = i;
+ }
+ T ret = v[t];
+ v.erase(v.begin()+t);
+ return ret;
+ }
+ };
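+
+    /*
+     * For example, after push(3), push(1) and push(5) on an
+     * our_priority_queue<int>, successive pop() calls return 5, 3, 1: pop()
+     * scans the whole vector and removes the maximum element under operator<
+     * each time, which is what makes a time-varying operator< safe here.
+     */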
+
+ /**
+     * A set of candidate query plans for a query. This class can return a best guess plan or run a
+ * QueryOp on all the plans.
+ */
+ class QueryPlanSet {
+ public:
+
+ typedef boost::shared_ptr<QueryPlan> QueryPlanPtr;
+ typedef vector<QueryPlanPtr> PlanSet;
+
+ /**
+ * @param originalFrsp - original constraints for this query clause; if null, frsp will be used.
+ */
+ QueryPlanSet( const char *ns,
+ auto_ptr<FieldRangeSetPair> frsp,
+ auto_ptr<FieldRangeSetPair> originalFrsp,
+ const BSONObj &originalQuery,
+ const BSONObj &order,
+ bool mustAssertOnYieldFailure = true,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
+
+ /** @return number of candidate plans. */
+ int nPlans() const { return _plans.size(); }
+
+ /**
+ * Clone op for each query plan, and @return the first cloned op to call
+ * setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+ shared_ptr<QueryOp> nextOp( QueryOp &originalOp, bool retried = false );
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Clear the runner member. */
+ void clearRunner();
+
+ QueryPlanPtr firstPlan() const { return _plans[ 0 ]; }
+
+ /** @return metadata about cursors and index bounds for all plans, suitable for explain output. */
+ BSONObj explain() const;
+ /** @return true iff a plan is selected based on previous success of this plan. */
+ bool usingCachedPlan() const { return _usingCachedPlan; }
+ /** @return a single plan that may work well for the specified query. */
+ QueryPlanPtr getBestGuess() const;
+
+ //for testing
+ const FieldRangeSetPair &frsp() const { return *_frsp; }
+ const FieldRangeSetPair *originalFrsp() const { return _originalFrsp.get(); }
+ bool modifiedKeys() const;
+ bool hasMultiKey() const;
+
+ private:
+ void addOtherPlans( bool checkFirst );
+ void addPlan( QueryPlanPtr plan, bool checkFirst ) {
+ if ( checkFirst && plan->indexKey().woCompare( _plans[ 0 ]->indexKey() ) == 0 )
+ return;
+ _plans.push_back( plan );
+ }
+ void init();
+ void addHint( IndexDetails &id );
+ class Runner {
+ public:
+ Runner( QueryPlanSet &plans, QueryOp &op );
+
+ /**
+ * Iterate interactively through candidate documents on all plans.
+ * QueryOp objects are returned at each interleaved step.
+ */
+
+ /** @return a plan that has completed, otherwise an arbitrary plan. */
+ shared_ptr<QueryOp> init();
+ /**
+ * Move the Runner forward one iteration, and @return the plan for
+ * this iteration.
+ */
+ shared_ptr<QueryOp> next();
+ /** @return next non error op if there is one, otherwise an error op. */
+ shared_ptr<QueryOp> nextNonError();
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Run until first op completes. */
+ shared_ptr<QueryOp> runUntilFirstCompletes();
+
+ void mayYield();
+ QueryOp &_op;
+ QueryPlanSet &_plans;
+ static void initOp( QueryOp &op );
+ static void nextOp( QueryOp &op );
+ static bool prepareToYieldOp( QueryOp &op );
+ static void recoverFromYieldOp( QueryOp &op );
+ private:
+ vector<shared_ptr<QueryOp> > _ops;
+ struct OpHolder {
+ OpHolder( const shared_ptr<QueryOp> &op ) : _op( op ), _offset() {}
+ shared_ptr<QueryOp> _op;
+ long long _offset;
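+                // Note the inverted comparison: an op that has scanned more documents
+                // sorts as 'less', so our_priority_queue::pop(), which removes the
+                // maximum, always yields the op that has scanned the fewest documents.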
+ bool operator<( const OpHolder &other ) const {
+ return _op->nscanned() + _offset > other._op->nscanned() + other._offset;
+ }
+ };
+ our_priority_queue<OpHolder> _queue;
+ };
+
+ const char *_ns;
+ BSONObj _originalQuery;
+ auto_ptr<FieldRangeSetPair> _frsp;
+ auto_ptr<FieldRangeSetPair> _originalFrsp;
+ PlanSet _plans;
+ bool _mayRecordPlan;
+ bool _usingCachedPlan;
+ BSONObj _hint;
+ BSONObj _order;
+ long long _oldNScanned;
+ bool _honorRecordedPlan;
+ BSONObj _min;
+ BSONObj _max;
+ string _special;
+ bool _bestGuessOnly;
+ bool _mayYield;
+ ElapsedTracker _yieldSometimesTracker;
+ shared_ptr<Runner> _runner;
+ bool _mustAssertOnYieldFailure;
+ };
+
+ /** Handles $or type queries by generating a QueryPlanSet for each $or clause. */
+ class MultiPlanScanner {
+ public:
+ MultiPlanScanner( const char *ns,
+ const BSONObj &query,
+ const BSONObj &order,
+ const BSONElement *hint = 0,
+ bool honorRecordedPlan = true,
+ const BSONObj &min = BSONObj(),
+ const BSONObj &max = BSONObj(),
+ bool bestGuessOnly = false,
+ bool mayYield = false);
+
+ /**
+ * Clone op for each query plan of a single $or clause, and @return the first cloned op
+ * to call setComplete() or setStop().
+ */
+
+ shared_ptr<QueryOp> runOpOnce( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOpOnce( T &op ) {
+ return dynamic_pointer_cast<T>( runOpOnce( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /**
+ * For each $or clause, calls runOpOnce on the child QueryOp cloned from the winning QueryOp
+ * of the previous $or clause (or from the supplied 'op' for the first $or clause).
+ */
+
+ shared_ptr<QueryOp> runOp( QueryOp &op );
+ template<class T>
+ shared_ptr<T> runOp( T &op ) {
+ return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) );
+ }
+
+ /** Initialize or iterate a runner generated from @param originalOp. */
+
+ void initialOp( const shared_ptr<QueryOp> &originalOp ) { _baseOp = originalOp; }
+ shared_ptr<QueryOp> nextOp();
+
+ /** Yield the runner member. */
+
+ bool prepareToYield();
+ void recoverFromYield();
+
+ /** Clear the runner member. */
+ void clearRunner();
+
+ int currentNPlans() const;
+
+ /**
+ * @return a single simple cursor if the scanner would run a single cursor
+ * for this query, otherwise return an empty shared_ptr.
+ */
+ shared_ptr<Cursor> singleCursor() const;
+
+ /**
+ * @return the query plan that would be used if the scanner would run a single
+ * cursor for this query, otherwise 0. The returned plan is invalid if this
+ * MultiPlanScanner is destroyed, hence we return a raw pointer.
+ */
+ const QueryPlan *singlePlan() const;
+
+ /** @return true iff more $or clauses need to be scanned. */
+ bool mayRunMore() const { return _or ? ( !_tableScanned && !_org->orFinished() ) : _i == 0; }
+ /** @return non-$or version of explain output. */
+ BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); }
+ /** @return true iff this is not a $or query and a plan is selected based on previous success of this plan. */
+ bool usingCachedPlan() const { return !_or && _currentQps->usingCachedPlan(); }
+ /** Don't attempt to scan multiple plans, just use the best guess. */
+ void setBestGuessOnly() { _bestGuessOnly = true; }
+ /** Yielding is allowed while running each QueryPlan. */
+ void mayYield( bool val ) { _mayYield = val; }
+ bool modifiedKeys() const { return _currentQps->modifiedKeys(); }
+ bool hasMultiKey() const { return _currentQps->hasMultiKey(); }
+
+ private:
+ void assertNotOr() const {
+ massert( 13266, "not implemented for $or query", !_or );
+ }
+ void assertMayRunMore() const {
+ massert( 13271, "can't run more ops", mayRunMore() );
+ }
+ shared_ptr<QueryOp> nextOpBeginningClause();
+ shared_ptr<QueryOp> nextOpHandleEndOfClause();
+ bool uselessOr( const BSONElement &hint ) const;
+ const char * _ns;
+ bool _or;
+ BSONObj _query;
+ shared_ptr<OrRangeGenerator> _org; // May be null in certain non $or query cases.
+ auto_ptr<QueryPlanSet> _currentQps;
+ int _i;
+ bool _honorRecordedPlan;
+ bool _bestGuessOnly;
+ BSONObj _hint;
+ bool _mayYield;
+ bool _tableScanned;
+ shared_ptr<QueryOp> _baseOp;
+ };
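+
+    /*
+     * A sketch of the interactive iteration protocol, assuming 'op' holds a
+     * shared_ptr to some QueryOp subclass (the driver loop is hypothetical):
+     *
+     *   MultiPlanScanner mps( ns, query, order );
+     *   mps.initialOp( op );
+     *   while( true ) {
+     *       shared_ptr<QueryOp> current = mps.nextOp();
+     *       if ( current->error() )
+     *           throw MsgAssertionException( current->exception() );
+     *       if ( current->complete() )
+     *           break;
+     *       // examine the current op's state here, one iteration at a time
+     *   }
+     */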
+
+ /** Provides a cursor interface for certain limited uses of a MultiPlanScanner. */
+ class MultiCursor : public Cursor {
+ public:
+ class CursorOp : public QueryOp {
+ public:
+ CursorOp() {}
+ CursorOp( const QueryOp &other ) : QueryOp( other ) {}
+ virtual shared_ptr<Cursor> newCursor() const = 0;
+ };
+ /** takes ownership of 'op' */
+ MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op = shared_ptr<CursorOp>(), bool mayYield = false );
+ /**
+ * Used
+         * 1. To hand off a query to a getMore()
+         * 2. To hand off a QueryOptimizerCursor
+ * @param nscanned is an optional initial value, if not supplied nscanned()
+ * will always return -1
+ */
+ MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned = -1 );
+
+ virtual bool ok() { return _c->ok(); }
+ virtual Record* _current() { return _c->_current(); }
+ virtual BSONObj current() { return _c->current(); }
+ virtual DiskLoc currLoc() { return _c->currLoc(); }
+ virtual bool advance() {
+ _c->advance();
+ while( !ok() && _mps->mayRunMore() ) {
+ nextClause();
+ }
+ return ok();
+ }
+ virtual BSONObj currKey() const { return _c->currKey(); }
+ virtual DiskLoc refLoc() { return _c->refLoc(); }
+ virtual void noteLocation() { _c->noteLocation(); }
+ virtual void checkLocation() { _c->checkLocation(); }
+ virtual bool supportGetMore() { return true; }
+ virtual bool supportYields() { return _c->supportYields(); }
+ virtual BSONObj indexKeyPattern() { return _c->indexKeyPattern(); }
+
+ /**
+ * with update we could potentially get the same document on multiple
+ * indexes, but update appears to already handle this with seenObjects
+ * so we don't have to do anything special here.
+ */
+ virtual bool getsetdup(DiskLoc loc) { return _c->getsetdup( loc ); }
+
+ virtual bool autoDedup() const { return _c->autoDedup(); }
+
+ virtual bool modifiedKeys() const { return _mps->modifiedKeys(); }
+
+ virtual bool isMultiKey() const { return _mps->hasMultiKey(); }
+
+ virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
+ virtual CoveredIndexMatcher* matcher() const { return _matcher.get(); }
+
+ virtual bool capped() const { return _c->capped(); }
+
+ /** return -1 if we're a getmore handoff */
+ virtual long long nscanned() { return _nscanned >= 0 ? _nscanned + _c->nscanned() : _nscanned; }
+ /** just for testing */
+ shared_ptr<Cursor> sub_c() const { return _c; }
+ private:
+ class NoOp : public CursorOp {
+ public:
+ NoOp() {}
+ NoOp( const QueryOp &other ) : CursorOp( other ) {}
+ virtual void _init() { setComplete(); }
+ virtual void next() {}
+ virtual bool mayRecordPlan() const { return false; }
+ virtual QueryOp *_createChild() const { return new NoOp(); }
+ virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); }
+ virtual long long nscanned() { assert( false ); return 0; }
+ };
+ void nextClause();
+ shared_ptr<CursorOp> _op;
+ shared_ptr<Cursor> _c;
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<CoveredIndexMatcher> _matcher;
+ long long _nscanned;
+ };
+
+ /** NOTE min, max, and keyPattern will be updated to be consistent with the selected index. */
+ IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern );
+
+ bool isSimpleIdQuery( const BSONObj& query );
+
+ /**
+ * @return a single cursor that may work well for the given query.
+ * It is possible no cursor is returned if the sort is not supported by an index. Clients are responsible
+ * for checking this if they are not sure an index for a sort exists, and defaulting to a non-sort if
+ * no suitable indices exist.
+ */
+ shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort );
+
+ /**
+ * Add-on functionality for queryutil classes requiring access to indexing
+ * functionality not currently linked to mongos.
+ * TODO Clean this up a bit, possibly with separate sharded and non sharded
+ * implementations for the appropriate queryutil classes or by pulling index
+ * related functionality into separate wrapper classes.
+ */
+ struct QueryUtilIndexed {
+ /** @return true if the index may be useful according to its KeySpec. */
+ static bool indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order );
+ /** Clear any indexes recorded as the best for either the single or multi key pattern. */
+ static void clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ /** Return a recorded best index for the single or multi key pattern. */
+ static pair< BSONObj, long long > bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order );
+ static bool uselessOr( const OrRangeGenerator& org, NamespaceDetails *d, int hintIdx );
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizercursor.cpp b/src/mongo/db/queryoptimizercursor.cpp
new file mode 100644
index 00000000000..07f8df12815
--- /dev/null
+++ b/src/mongo/db/queryoptimizercursor.cpp
@@ -0,0 +1,530 @@
+// @file queryoptimizercursor.cpp
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "queryoptimizer.h"
+#include "pdfile.h"
+#include "clientcursor.h"
+#include "btree.h"
+#include "queryoptimizercursor.h"
+
+namespace mongo {
+
+ static const int OutOfOrderDocumentsAssertionCode = 14810;
+
+ /**
+ * A QueryOp implementation utilized by the QueryOptimizerCursor
+ */
+ class QueryOptimizerCursorOp : public QueryOp {
+ public:
+ /**
+ * @param aggregateNscanned - shared long long counting total nscanned for
+ * query ops for all cursors.
+ * @param requireIndex - if unindexed scans should be prohibited.
+ */
+ QueryOptimizerCursorOp( long long &aggregateNscanned, bool requireIndex, int cumulativeCount = 0 ) : _matchCounter( aggregateNscanned, cumulativeCount ), _countingMatches(), _mustAdvance(), _capped(), _yieldRecoveryFailed(), _requireIndex( requireIndex ) {}
+
+ virtual void _init() {
+ if ( qp().scanAndOrderRequired() ) {
+ throw MsgAssertionException( OutOfOrderDocumentsAssertionCode, "order spec cannot be satisfied with index" );
+ }
+ if ( _requireIndex && strcmp( qp().indexKey().firstElementFieldName(), "$natural" ) == 0 ) {
+ throw MsgAssertionException( 9011, "Not an index cursor" );
+ }
+ _c = qp().newCursor();
+
+ // The QueryOptimizerCursor::prepareToTouchEarlierIterate() implementation requires _c->prepareToYield() to work.
+ verify( 15940, _c->supportYields() );
+ _capped = _c->capped();
+
+ // TODO This violates the current Cursor interface abstraction, but for now it's simpler to keep our own set of
+ // dups rather than avoid poisoning the cursor's dup set with unreturned documents. Deduping documents
+ // matched in this QueryOptimizerCursorOp will run against the takeover cursor.
+ _matchCounter.setCheckDups( _c->isMultiKey() );
+
+ _matchCounter.updateNscanned( _c->nscanned() );
+ }
+
+ virtual long long nscanned() {
+ return _c ? _c->nscanned() : _matchCounter.nscanned();
+ }
+
+ virtual bool prepareToYield() {
+ if ( _c && !_cc ) {
+ _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) );
+ }
+ if ( _cc ) {
+ recordCursorLocation();
+ return _cc->prepareToYield( _yieldData );
+ }
+ // no active cursor - ok to yield
+ return true;
+ }
+
+ virtual void recoverFromYield() {
+ if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) {
+ _yieldRecoveryFailed = true;
+ _c.reset();
+ _cc.reset();
+
+ if ( _capped ) {
+ msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun: " << qp().ns() );
+ }
+ else if ( qp().mustAssertOnYieldFailure() ) {
+ msgassertedNoTrace( 15892, str::stream() << "QueryOptimizerCursorOp::recoverFromYield() failed to recover" );
+ }
+ else {
+ // we don't fail query since we're fine with returning partial data if collection dropped
+ // also, see SERVER-2454
+ }
+ }
+ else {
+ checkCursorAdvanced();
+ }
+ }
+
+ void prepareToTouchEarlierIterate() {
+ recordCursorLocation();
+ if ( _c ) {
+ _c->prepareToTouchEarlierIterate();
+ }
+ }
+
+ void recoverFromTouchingEarlierIterate() {
+ if ( _c ) {
+ _c->recoverFromTouchingEarlierIterate();
+ }
+ checkCursorAdvanced();
+ }
+
+ virtual void next() {
+ mayAdvance();
+
+ if ( _matchCounter.enoughCumulativeMatchesToChooseAPlan() ) {
+ setStop();
+ return;
+ }
+ if ( !_c || !_c->ok() ) {
+ setComplete();
+ return;
+ }
+
+ _mustAdvance = true;
+ }
+ virtual QueryOp *_createChild() const {
+ return new QueryOptimizerCursorOp( _matchCounter.aggregateNscanned(), _requireIndex, _matchCounter.cumulativeCount() );
+ }
+ DiskLoc currLoc() const { return _c ? _c->currLoc() : DiskLoc(); }
+ BSONObj currKey() const { return _c ? _c->currKey() : BSONObj(); }
+ bool currentMatches( MatchDetails *details ) {
+ bool ret = ( _c && _c->ok() ) ? matcher( _c.get() )->matchesCurrent( _c.get(), details ) : false;
+ // Cache the match, so we can count it in mayAdvance().
+ _matchCounter.setMatch( ret );
+ return ret;
+ }
+ virtual bool mayRecordPlan() const {
+ return !_yieldRecoveryFailed && complete() && ( !stopRequested() || _matchCounter.enoughMatchesToRecordPlan() );
+ }
+ shared_ptr<Cursor> cursor() const { return _c; }
+ private:
+ void mayAdvance() {
+ if ( !_c ) {
+ return;
+ }
+ if ( countingMatches() ) {
+ // Check match if not yet known.
+ if ( !_matchCounter.knowMatch() ) {
+ currentMatches( 0 );
+ }
+ _matchCounter.countMatch( currLoc() );
+ }
+ if ( _mustAdvance ) {
+ _c->advance();
+ handleCursorAdvanced();
+ }
+ _matchCounter.updateNscanned( _c->nscanned() );
+ }
+ // Don't count matches on the first call to next(), which occurs before the first result is returned.
+ bool countingMatches() {
+ if ( _countingMatches ) {
+ return true;
+ }
+ _countingMatches = true;
+ return false;
+ }
+
+ void recordCursorLocation() {
+ _posBeforeYield = currLoc();
+ }
+ void checkCursorAdvanced() {
+ // This check will not correctly determine if we are looking at a different document in
+ // all cases, but it is adequate for updating the query plan's match count (just used to pick
+ // plans, not returned to the client) and adjust iteration via _mustAdvance.
+ if ( _posBeforeYield != currLoc() ) {
+ // If the yield advanced our position, the next next() will be a no op.
+ handleCursorAdvanced();
+ }
+ }
+ void handleCursorAdvanced() {
+ _mustAdvance = false;
+ _matchCounter.resetMatch();
+ }
+
+ CachedMatchCounter _matchCounter;
+ bool _countingMatches;
+ bool _mustAdvance;
+ bool _capped;
+ shared_ptr<Cursor> _c;
+ ClientCursor::CleanupPointer _cc;
+ DiskLoc _posBeforeYield;
+ ClientCursor::YieldData _yieldData;
+ bool _yieldRecoveryFailed;
+ bool _requireIndex;
+ };
+
+ /**
+ * This cursor runs a MultiPlanScanner iteratively and returns results from
+ * the scanner's cursors as they become available. Once the scanner chooses
+ * a single plan, this cursor becomes a simple wrapper around that single
+ * plan's cursor (called the 'takeover' cursor).
+ */
+ class QueryOptimizerCursor : public Cursor {
+ public:
+ QueryOptimizerCursor( auto_ptr<MultiPlanScanner> &mps, bool requireIndex ) :
+ _mps( mps ),
+ _originalOp( new QueryOptimizerCursorOp( _nscanned, requireIndex ) ),
+ _currOp(),
+ _nscanned() {
+ _mps->initialOp( _originalOp );
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+ if ( !op->complete() ) {
+ _currOp = dynamic_cast<QueryOptimizerCursorOp*>( op.get() );
+ }
+ }
+
+ virtual bool ok() { return _takeover ? _takeover->ok() : !currLoc().isNull(); }
+
+ virtual Record* _current() {
+ if ( _takeover ) {
+ return _takeover->_current();
+ }
+ assertOk();
+ return currLoc().rec();
+ }
+
+ virtual BSONObj current() {
+ if ( _takeover ) {
+ return _takeover->current();
+ }
+ assertOk();
+ return currLoc().obj();
+ }
+
+ virtual DiskLoc currLoc() { return _takeover ? _takeover->currLoc() : _currLoc(); }
+
+ DiskLoc _currLoc() const {
+ dassert( !_takeover );
+ return _currOp ? _currOp->currLoc() : DiskLoc();
+ }
+
+ virtual bool advance() {
+ return _advance( false );
+ }
+
+ virtual BSONObj currKey() const {
+ if ( _takeover ) {
+ return _takeover->currKey();
+ }
+ assertOk();
+ return _currOp->currKey();
+ }
+
+ /**
+         * When the return value isNull(), our cursor will be ignored for yielding by the client cursor implementation.
+ * In such cases, an internal ClientCursor will update the position of component cursors when necessary.
+ */
+ virtual DiskLoc refLoc() { return _takeover ? _takeover->refLoc() : DiskLoc(); }
+
+ virtual BSONObj indexKeyPattern() {
+ if ( _takeover ) {
+ return _takeover->indexKeyPattern();
+ }
+ assertOk();
+ return _currOp->cursor()->indexKeyPattern();
+ }
+
+ virtual bool supportGetMore() { return false; }
+
+ virtual bool supportYields() { return _takeover ? _takeover->supportYields() : true; }
+
+ virtual void prepareToTouchEarlierIterate() {
+ if ( _takeover ) {
+ _takeover->prepareToTouchEarlierIterate();
+ }
+ else if ( _currOp ) {
+ if ( _mps->currentNPlans() == 1 ) {
+ // This single plan version is a bit more performant, so we use it when possible.
+ _currOp->prepareToTouchEarlierIterate();
+ }
+ else {
+ // With multiple plans, the 'earlier iterate' could be the current iterate of one of
+ // the component plans. We do a full yield of all plans, using ClientCursors.
+ verify( 15941, _mps->prepareToYield() );
+ }
+ }
+ }
+
+ virtual void recoverFromTouchingEarlierIterate() {
+ if ( _takeover ) {
+ _takeover->recoverFromTouchingEarlierIterate();
+ }
+ else if ( _currOp ) {
+ if ( _mps->currentNPlans() == 1 ) {
+ _currOp->recoverFromTouchingEarlierIterate();
+ }
+ else {
+ recoverFromYield();
+ }
+ }
+ }
+
+ virtual bool prepareToYield() {
+ if ( _takeover ) {
+ return _takeover->prepareToYield();
+ }
+ else if ( _currOp ) {
+ return _mps->prepareToYield();
+ }
+ else {
+ // No state needs to be protected, so yielding is fine.
+ return true;
+ }
+ }
+
+ virtual void recoverFromYield() {
+ if ( _takeover ) {
+ _takeover->recoverFromYield();
+ return;
+ }
+ if ( _currOp ) {
+ _mps->recoverFromYield();
+ if ( _currOp->error() || !ok() ) {
+                    // Advance to a non error op if one of the ops errored out.
+ // Advance to a following $or clause if the $or clause returned all results.
+ _advance( true );
+ }
+ }
+ }
+
+ virtual string toString() { return "QueryOptimizerCursor"; }
+
+ virtual bool getsetdup(DiskLoc loc) {
+ if ( _takeover ) {
+ if ( getdupInternal( loc ) ) {
+ return true;
+ }
+ return _takeover->getsetdup( loc );
+ }
+ assertOk();
+ return getsetdupInternal( loc );
+ }
+
+        /** Matcher needs to know if the cursor being forwarded to is multikey. */
+ virtual bool isMultiKey() const {
+ if ( _takeover ) {
+ return _takeover->isMultiKey();
+ }
+ assertOk();
+ return _currOp->cursor()->isMultiKey();
+ }
+
+ virtual bool modifiedKeys() const { return true; }
+
+ /** Initial capped wrapping cases (before takeover) are handled internally by a component ClientCursor. */
+ virtual bool capped() const { return _takeover ? _takeover->capped() : false; }
+
+ virtual long long nscanned() { return _takeover ? _takeover->nscanned() : _nscanned; }
+
+ virtual shared_ptr<CoveredIndexMatcher> matcherPtr() const {
+ if ( _takeover ) {
+ return _takeover->matcherPtr();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() );
+ }
+
+ virtual CoveredIndexMatcher* matcher() const {
+ if ( _takeover ) {
+ return _takeover->matcher();
+ }
+ assertOk();
+ return _currOp->matcher( _currOp->cursor() ).get();
+ }
+
+ virtual bool currentMatches( MatchDetails *details = 0 ) {
+ if ( _takeover ) {
+ return _takeover->currentMatches( details );
+ }
+ assertOk();
+ return _currOp->currentMatches( details );
+ }
+
+ private:
+ /**
+ * Advances the QueryPlanSet::Runner.
+ * @param force - advance even if the current query op is not valid. The 'force' param should only be specified
+ * when there are plans left in the runner.
+ */
+ bool _advance( bool force ) {
+ if ( _takeover ) {
+ return _takeover->advance();
+ }
+
+ if ( !force && !ok() ) {
+ return false;
+ }
+
+ DiskLoc prevLoc = _currLoc();
+
+ _currOp = 0;
+ shared_ptr<QueryOp> op = _mps->nextOp();
+ rethrowOnError( op );
+
+ // Avoiding dynamic_cast here for performance. Soon we won't need to
+ // do a cast at all.
+ QueryOptimizerCursorOp *qocop = (QueryOptimizerCursorOp*)( op.get() );
+
+ if ( !op->complete() ) {
+ // The 'qocop' will be valid until we call _mps->nextOp() again. We return 'current' values from this op.
+ _currOp = qocop;
+ }
+ else if ( op->stopRequested() ) {
+ if ( qocop->cursor() ) {
+ // Ensure that prepareToTouchEarlierIterate() may be called safely when a BasicCursor takes over.
+ if ( !prevLoc.isNull() && prevLoc == qocop->currLoc() ) {
+ qocop->cursor()->advance();
+ }
+ // Clear the Runner and any unnecessary QueryOps and their ClientCursors.
+ _mps->clearRunner();
+ _takeover.reset( new MultiCursor( _mps,
+ qocop->cursor(),
+ op->matcher( qocop->cursor() ),
+ *op,
+ _nscanned - qocop->cursor()->nscanned() ) );
+ }
+ }
+
+ return ok();
+ }
+ /** Forward an exception when the runner errs out. */
+ void rethrowOnError( const shared_ptr< QueryOp > &op ) {
+ if ( op->error() ) {
+ throw MsgAssertionException( op->exception() );
+ }
+ }
+
+ void assertOk() const {
+ massert( 14809, "Invalid access for cursor that is not ok()", !_currLoc().isNull() );
+ }
+
+ /** Insert and check for dups before takeover occurs */
+ bool getsetdupInternal(const DiskLoc &loc) {
+ return _dups.getsetdup( loc );
+ }
+
+ /** Just check for dups - after takeover occurs */
+ bool getdupInternal(const DiskLoc &loc) {
+ dassert( _takeover );
+ return _dups.getdup( loc );
+ }
+
+ auto_ptr<MultiPlanScanner> _mps;
+ shared_ptr<QueryOptimizerCursorOp> _originalOp;
+ QueryOptimizerCursorOp *_currOp;
+ shared_ptr<Cursor> _takeover;
+ long long _nscanned;
+ // Using a SmallDupSet seems a bit hokey, but I've measured a 5% performance improvement with ~100 document non multi key scans.
+ SmallDupSet _dups;
+ };
+
+ shared_ptr<Cursor> newQueryOptimizerCursor( auto_ptr<MultiPlanScanner> mps, bool requireIndex ) {
+ try {
+ return shared_ptr<Cursor>( new QueryOptimizerCursor( mps, requireIndex ) );
+ } catch( const AssertionException &e ) {
+ if ( e.getCode() == OutOfOrderDocumentsAssertionCode ) {
+ // If no indexes follow the requested sort order, return an
+ // empty pointer. This is legacy behavior based on bestGuessCursor().
+ return shared_ptr<Cursor>();
+ }
+ throw;
+ }
+ return shared_ptr<Cursor>();
+ }
+
+ shared_ptr<Cursor> NamespaceDetailsTransient::getCursor( const char *ns, const BSONObj &query,
+ const BSONObj &order, bool requireIndex,
+ bool *simpleEqualityMatch ) {
+ if ( simpleEqualityMatch ) {
+ *simpleEqualityMatch = false;
+ }
+ if ( query.isEmpty() && order.isEmpty() && !requireIndex ) {
+ // TODO This will not use a covered index currently.
+ return theDataFileMgr.findAll( ns );
+ }
+ if ( isSimpleIdQuery( query ) ) {
+ Database *database = cc().database();
+ verify( 15985, database );
+ NamespaceDetails *d = database->namespaceIndex.details(ns);
+ if ( d ) {
+ int idxNo = d->findIdIndex();
+ if ( idxNo >= 0 ) {
+ IndexDetails& i = d->idx( idxNo );
+ BSONObj key = i.getKeyFromQuery( query );
+ return shared_ptr<Cursor>( BtreeCursor::make( d, idxNo, i, key, key, true, 1 ) );
+ }
+ }
+ }
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ shared_ptr<Cursor> single = mps->singleCursor();
+ if ( single ) {
+ if ( !( requireIndex &&
+ dynamic_cast<BasicCursor*>( single.get() ) /* May not use an unindexed cursor */ ) ) {
+ if ( !query.isEmpty() && !single->matcher() ) {
+ shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, single->indexKeyPattern() ) );
+ single->setMatcher( matcher );
+ }
+ if ( simpleEqualityMatch ) {
+ const QueryPlan *qp = mps->singlePlan();
+ if ( qp->exactKeyMatch() && !single->matcher()->needRecord() ) {
+ *simpleEqualityMatch = true;
+ }
+ }
+ return single;
+ }
+ }
+ return newQueryOptimizerCursor( mps, requireIndex );
+ }
+
+    /** This interface is available for testing only. */
+ shared_ptr<Cursor> newQueryOptimizerCursor( const char *ns, const BSONObj &query, const BSONObj &order, bool requireIndex ) {
+ auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false
+ return newQueryOptimizerCursor( mps, requireIndex );
+ }
+
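+    /*
+     * Illustrative sketch (hypothetical, not part of this change): driving the
+     * testing interface above. Assumes a read lock is held and that
+     * "unittests.foo" exists; the namespace and query are made up.
+     *
+     *   shared_ptr<Cursor> c =
+     *       newQueryOptimizerCursor( "unittests.foo", BSON( "a" << GT << 0 ),
+     *                                BSONObj(), false );
+     *   for( ; c && c->ok(); c->advance() ) {
+     *       if ( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) {
+     *           BSONObj doc = c->current(); // document under the cursor
+     *       }
+     *   }
+     */
+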
+} // namespace mongo
diff --git a/src/mongo/db/queryoptimizercursor.h b/src/mongo/db/queryoptimizercursor.h
new file mode 100644
index 00000000000..ee5a1663370
--- /dev/null
+++ b/src/mongo/db/queryoptimizercursor.h
@@ -0,0 +1,150 @@
+// @file queryoptimizercursor.h
+
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace mongo {
+
+ /** Helper class for caching and counting matches during execution of a QueryPlan. */
+ class CachedMatchCounter {
+ public:
+ /**
+         * @param aggregateNscanned - shared count of nscanned for this and other plans.
+ * @param cumulativeCount - starting point for accumulated count over a series of plans.
+ */
+ CachedMatchCounter( long long &aggregateNscanned, int cumulativeCount ) : _aggregateNscanned( aggregateNscanned ), _nscanned(), _cumulativeCount( cumulativeCount ), _count(), _checkDups(), _match( Unknown ), _counted() {}
+
+ /** Set whether dup checking is enabled when counting. */
+ void setCheckDups( bool checkDups ) { _checkDups = checkDups; }
+
+ /**
+ * Usual sequence of events:
+         * 1) resetMatch() - reset stored match value to Unknown.
+ * 2) setMatch() - set match value to a definite true/false value.
+ * 3) knowMatch() - check if setMatch() has been called.
+ * 4) countMatch() - increment count if match is true.
+ */
+
+ void resetMatch() {
+ _match = Unknown;
+ _counted = false;
+ }
+ void setMatch( bool match ) { _match = match ? True : False; }
+ bool knowMatch() const { return _match != Unknown; }
+ void countMatch( const DiskLoc &loc ) {
+ if ( !_counted && _match == True && !getsetdup( loc ) ) {
+ ++_cumulativeCount;
+ ++_count;
+ _counted = true;
+ }
+ }
+
+ bool enoughCumulativeMatchesToChooseAPlan() const {
+            // 101 is the default batch limit at which a query switches to a
+            // getMore, and historically also the match count at which a plan
+            // was chosen.
+ return _cumulativeCount >= 101;
+ }
+ bool enoughMatchesToRecordPlan() const {
+ // Recording after 50 matches is a historical default (101 default limit / 2).
+ return _count > 50;
+ }
+
+ int cumulativeCount() const { return _cumulativeCount; }
+ int count() const { return _count; }
+
+ /** Update local and aggregate nscanned counts. */
+ void updateNscanned( long long nscanned ) {
+ _aggregateNscanned += ( nscanned - _nscanned );
+ _nscanned = nscanned;
+ }
+ long long nscanned() const { return _nscanned; }
+ long long &aggregateNscanned() const { return _aggregateNscanned; }
+ private:
+ bool getsetdup( const DiskLoc &loc ) {
+ if ( !_checkDups ) {
+ return false;
+ }
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert( loc );
+ return !p.second;
+ }
+ long long &_aggregateNscanned;
+ long long _nscanned;
+ int _cumulativeCount;
+ int _count;
+ bool _checkDups;
+ enum MatchState { Unknown, False, True };
+ MatchState _match;
+ bool _counted;
+ set<DiskLoc> _dups;
+ };
+
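+    /*
+     * Illustrative sketch of the call sequence documented above, for one
+     * document scanned by a plan ('matched', 'loc' and 'cursorNscanned' are
+     * hypothetical stand-ins):
+     *
+     *   long long aggregateNscanned = 0;
+     *   CachedMatchCounter counter( aggregateNscanned, 0 );
+     *   counter.setCheckDups( true );     // dedup multikey index locations
+     *   counter.resetMatch();             // 1) new document, match Unknown
+     *   if ( !counter.knowMatch() ) {     // 3) no verdict recorded yet
+     *       counter.setMatch( matched );  // 2) record the matcher's verdict
+     *   }
+     *   counter.countMatch( loc );        // 4) counts once per DiskLoc
+     *   counter.updateNscanned( cursorNscanned );
+     */
+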
+ /** Dup tracking class, optimizing one common case with small set and few initial reads. */
+ class SmallDupSet {
+ public:
+ SmallDupSet() : _accesses() {
+ _vec.reserve( 250 );
+ }
+        /** @return true if 'loc' was already in the set, false if this call added it. */
+ bool getsetdup( const DiskLoc &loc ) {
+ access();
+ return vec() ? getsetdupVec( loc ) : getsetdupSet( loc );
+ }
+ /** @return true when @param loc in the set. */
+ bool getdup( const DiskLoc &loc ) {
+ access();
+ return vec() ? getdupVec( loc ) : getdupSet( loc );
+ }
+ private:
+ void access() {
+ ++_accesses;
+ mayUpgrade();
+ }
+ void mayUpgrade() {
+ if ( vec() && _accesses > 500 ) {
+ _set.insert( _vec.begin(), _vec.end() );
+ }
+ }
+ bool vec() const {
+ return _set.size() == 0;
+ }
+ bool getsetdupVec( const DiskLoc &loc ) {
+ if ( getdupVec( loc ) ) {
+ return true;
+ }
+ _vec.push_back( loc );
+ return false;
+ }
+ bool getdupVec( const DiskLoc &loc ) const {
+ for( vector<DiskLoc>::const_iterator i = _vec.begin(); i != _vec.end(); ++i ) {
+ if ( *i == loc ) {
+ return true;
+ }
+ }
+ return false;
+ }
+ bool getsetdupSet( const DiskLoc &loc ) {
+ pair<set<DiskLoc>::iterator, bool> p = _set.insert(loc);
+ return !p.second;
+ }
+ bool getdupSet( const DiskLoc &loc ) {
+ return _set.count( loc ) > 0;
+ }
+ vector<DiskLoc> _vec;
+ set<DiskLoc> _set;
+ long long _accesses;
+ };
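+
+    /*
+     * Illustrative sketch (hypothetical location) of the two lookup modes:
+     *
+     *   SmallDupSet dups;
+     *   DiskLoc loc;           // some location produced by a cursor
+     *   dups.getsetdup( loc ); // false - newly recorded
+     *   dups.getsetdup( loc ); // true - already present
+     *   dups.getdup( loc );    // true - lookup without inserting
+     */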
+} // namespace mongo
diff --git a/src/mongo/db/querypattern.cpp b/src/mongo/db/querypattern.cpp
new file mode 100644
index 00000000000..e20e2b6a6ae
--- /dev/null
+++ b/src/mongo/db/querypattern.cpp
@@ -0,0 +1,99 @@
+// @file querypattern.cpp - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "querypattern.h"
+
+namespace mongo {
+
+ QueryPattern::QueryPattern( const FieldRangeSet &frs, const BSONObj &sort ) {
+ for( map<string,FieldRange>::const_iterator i = frs.ranges().begin(); i != frs.ranges().end(); ++i ) {
+ if ( i->second.equality() ) {
+ _fieldTypes[ i->first ] = QueryPattern::Equality;
+ }
+ else if ( i->second.empty() ) {
+ // This case generally results from an upper and lower bound that are inconsistent for a single key index.
+ _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound;
+ }
+ else if ( i->second.nontrivial() ) {
+ bool upper = i->second.max().type() != MaxKey;
+ bool lower = i->second.min().type() != MinKey;
+ if ( upper && lower )
+ _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound;
+ else if ( upper )
+ _fieldTypes[ i->first ] = QueryPattern::UpperBound;
+ else if ( lower )
+ _fieldTypes[ i->first ] = QueryPattern::LowerBound;
+ }
+ }
+ setSort( sort );
+ }
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator==( const QueryPattern &other ) const {
+ bool less = operator<( other );
+ bool more = other.operator<( *this );
+ assert( !( less && more ) );
+ return !( less || more );
+ }
+
+ /** for testing only - speed unimportant */
+ bool QueryPattern::operator!=( const QueryPattern &other ) const {
+ return !operator==( other );
+ }
+
+ string typeToString( enum QueryPattern::Type t ) {
+ switch (t) {
+ case QueryPattern::Equality:
+ return "Equality";
+ case QueryPattern::LowerBound:
+ return "LowerBound";
+ case QueryPattern::UpperBound:
+ return "UpperBound";
+ case QueryPattern::UpperAndLowerBound:
+ return "UpperAndLowerBound";
+ }
+ return "";
+ }
+
+ string QueryPattern::toString() const {
+ BSONObjBuilder b;
+ for( map<string,Type>::const_iterator i = _fieldTypes.begin(); i != _fieldTypes.end(); ++i ) {
+ b << i->first << typeToString( i->second );
+ }
+ return BSON( "query" << b.done() << "sort" << _sort ).toString();
+ }
+
+ void QueryPattern::setSort( const BSONObj sort ) {
+ _sort = normalizeSort( sort );
+ }
+
+ BSONObj QueryPattern::normalizeSort( const BSONObj &spec ) {
+ if ( spec.isEmpty() )
+ return spec;
+ int direction = ( spec.firstElement().number() >= 0 ) ? 1 : -1;
+ BSONObjIterator i( spec );
+ BSONObjBuilder b;
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ b.append( e.fieldName(), direction * ( ( e.number() >= 0 ) ? -1 : 1 ) );
+ }
+ return b.obj();
+ }
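+
+    /*
+     * Worked example (illustrative): normalizeSort() keys off the sign of the
+     * first element, so a sort spec and its exact reverse normalize to the
+     * same object and yield matching QueryPatterns:
+     *
+     *   normalizeSort( BSON( "a" << 1 << "b" << -1 ) )  => { a: -1, b: 1 }
+     *   normalizeSort( BSON( "a" << -1 << "b" << 1 ) )  => { a: -1, b: 1 }
+     */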
+
+} // namespace mongo
diff --git a/src/mongo/db/querypattern.h b/src/mongo/db/querypattern.h
new file mode 100644
index 00000000000..000c301a0de
--- /dev/null
+++ b/src/mongo/db/querypattern.h
@@ -0,0 +1,78 @@
+// @file querypattern.h - Query pattern matching for selecting similar plans given similar queries.
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "queryutil.h"
+
+namespace mongo {
+
+ /**
+ * Implements query pattern matching, used to determine if a query is
+ * similar to an earlier query and should use the same plan.
+ *
+ * Two queries will generate the same QueryPattern, and therefore match each
+ * other, if their fields have the same Types and they have the same sort
+ * spec.
+ */
+ class QueryPattern {
+ public:
+ QueryPattern( const FieldRangeSet &frs, const BSONObj &sort );
+ enum Type {
+ Equality,
+ LowerBound,
+ UpperBound,
+ UpperAndLowerBound
+ };
+ bool operator<( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator==( const QueryPattern &other ) const;
+ /** for testing only */
+ bool operator!=( const QueryPattern &other ) const;
+ /** for development / debugging */
+ string toString() const;
+ private:
+ void setSort( const BSONObj sort );
+ static BSONObj normalizeSort( const BSONObj &spec );
+ map<string,Type> _fieldTypes;
+ BSONObj _sort;
+ };
+
+ inline bool QueryPattern::operator<( const QueryPattern &other ) const {
+ map<string,Type>::const_iterator i = _fieldTypes.begin();
+ map<string,Type>::const_iterator j = other._fieldTypes.begin();
+ while( i != _fieldTypes.end() ) {
+ if ( j == other._fieldTypes.end() )
+ return false;
+ if ( i->first < j->first )
+ return true;
+ else if ( i->first > j->first )
+ return false;
+ if ( i->second < j->second )
+ return true;
+ else if ( i->second > j->second )
+ return false;
+ ++i;
+ ++j;
+ }
+ if ( j != other._fieldTypes.end() )
+ return true;
+ return _sort.woCompare( other._sort ) < 0;
+ }
+
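+    /*
+     * Illustrative sketch: operator< orders QueryPatterns by field name, then
+     * field Type, then sort spec, so a pattern can act as an ordered map key
+     * when caching a plan choice (the cache shown here is hypothetical):
+     *
+     *   map<QueryPattern,BSONObj> indexForPattern;
+     *   FieldRangeSet frs( "test.foo", BSON( "a" << 1 ), true, true );
+     *   indexForPattern[ frs.pattern( BSONObj() ) ] = BSON( "a" << 1 );
+     *   // a later query { a: 2 } yields an equal pattern and finds the entry
+     */
+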
+} // namespace mongo
diff --git a/src/mongo/db/queryutil-inl.h b/src/mongo/db/queryutil-inl.h
new file mode 100644
index 00000000000..08d3b1fac52
--- /dev/null
+++ b/src/mongo/db/queryutil-inl.h
@@ -0,0 +1,153 @@
+// @file queryutil-inl.h - Inline definitions for frequently called queryutil.h functions
+
+/* Copyright 2011 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace mongo {
+
+ inline bool FieldInterval::equality() const {
+ if ( _cachedEquality == -1 ) {
+ _cachedEquality = ( _lower._inclusive && _upper._inclusive && _lower._bound.woCompare( _upper._bound, false ) == 0 );
+ }
+ return _cachedEquality != 0;
+ }
+
+ inline bool FieldRange::equality() const {
+ return
+ !empty() &&
+ min().woCompare( max(), false ) == 0 &&
+ maxInclusive() &&
+ minInclusive();
+ }
+
+ inline bool FieldRange::inQuery() const {
+ if ( equality() ) {
+ return true;
+ }
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ if ( !i->equality() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * TODO Assumes intervals are contiguous and minKey/maxKey will not be
+ * matched against.
+ */
+ inline bool FieldRange::nontrivial() const {
+ return
+ ! empty() &&
+ ( _intervals.size() != 1 ||
+ minKey.firstElement().woCompare( min(), false ) != 0 ||
+ maxKey.firstElement().woCompare( max(), false ) != 0 );
+ }
+
+ inline const FieldRange &FieldRangeSet::range( const char *fieldName ) const {
+ map<string,FieldRange>::const_iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() )
+ return trivialRange();
+ return f->second;
+ }
+
+ inline FieldRange &FieldRangeSet::range( const char *fieldName ) {
+ map<string,FieldRange>::iterator f = _ranges.find( fieldName );
+ if ( f == _ranges.end() ) {
+ _ranges.insert( make_pair( string( fieldName ), trivialRange() ) );
+ return _ranges.find( fieldName )->second;
+ }
+ return f->second;
+ }
+
+ inline int FieldRangeSet::nNontrivialRanges() const {
+ int count = 0;
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.nontrivial() )
+ ++count;
+ }
+ return count;
+ }
+
+ inline bool FieldRangeSet::matchPossible() const {
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ if ( i->second.empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline bool FieldRangeSet::matchPossibleForIndex( const BSONObj &keyPattern ) const {
+ if ( !_singleKey ) {
+ return matchPossible();
+ }
+ BSONObjIterator i( keyPattern );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( e.fieldName() == string( "$natural" ) ) {
+ return true;
+ }
+ if ( range( e.fieldName() ).empty() ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline long long FieldRangeVector::size() {
+ long long ret = 1;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ ret *= i->intervals().size();
+ }
+ return ret;
+ }
+
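+    /*
+     * Worked example (illustrative): size() is the product of the interval
+     * counts across indexed fields. For an index { a: 1, b: 1 } with bounds
+     * from { a: { $in: [ 1, 2, 3 ] }, b: 4 }, field "a" contributes three
+     * intervals and "b" one, so size() == 3 * 1 == 3 key ranges to scan.
+     */
+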
+ inline FieldRangeSetPair *OrRangeGenerator::topFrsp() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_orSets.size()) {
+ *ret &= _orSets.front();
+ }
+ return ret;
+ }
+
+ inline FieldRangeSetPair *OrRangeGenerator::topFrspOriginal() const {
+ FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet );
+ if (_originalOrSets.size()) {
+ *ret &= _originalOrSets.front();
+ }
+ return ret;
+ }
+
+ inline bool FieldRangeSetPair::matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ assertValidIndexOrNoIndex( d, idxNo );
+ if ( !matchPossible() ) {
+ return false;
+ }
+ if ( idxNo < 0 ) {
+ // multi key matchPossible() is true, so return true.
+ return true;
+ }
+ return frsForIndex( d, idxNo ).matchPossibleForIndex( keyPattern );
+ }
+
+ inline void FieldRangeSetPair::assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14049, "FieldRangeSetPair invalid index specified", idxNo >= -1 );
+ if ( idxNo >= 0 ) {
+ assertValidIndex( d, idxNo );
+ }
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil.cpp b/src/mongo/db/queryutil.cpp
new file mode 100644
index 00000000000..e6748c4bc2e
--- /dev/null
+++ b/src/mongo/db/queryutil.cpp
@@ -0,0 +1,1551 @@
+// @file queryutil.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+
+#include "btree.h"
+#include "matcher.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "../util/unittest.h"
+#include "dbmessage.h"
+#include "indexkey.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+ extern BSONObj staticNull;
+ extern BSONObj staticUndefined;
+
+    /** returns a string that, when used as a matcher, would match a superset of regex()
+ returns "" for complex regular expressions
+ used to optimize queries in some simple regex cases that start with '^'
+
+ if purePrefix != NULL, sets it to whether the regex can be converted to a range query
+ */
+ string simpleRegex(const char* regex, const char* flags, bool* purePrefix) {
+ string r = "";
+
+ if (purePrefix) *purePrefix = false;
+
+ bool multilineOK;
+ if ( regex[0] == '\\' && regex[1] == 'A') {
+ multilineOK = true;
+ regex += 2;
+ }
+ else if (regex[0] == '^') {
+ multilineOK = false;
+ regex += 1;
+ }
+ else {
+ return r;
+ }
+
+ bool extended = false;
+ while (*flags) {
+ switch (*(flags++)) {
+ case 'm': // multiline
+ if (multilineOK)
+ continue;
+ else
+ return r;
+ case 'x': // extended
+ extended = true;
+ break;
+ default:
+                return r; // can't use index
+ }
+ }
+
+ stringstream ss;
+
+ while(*regex) {
+ char c = *(regex++);
+ if ( c == '*' || c == '?' ) {
+ // These are the only two symbols that make the last char optional
+ r = ss.str();
+ r = r.substr( 0 , r.size() - 1 );
+ return r; //breaking here fails with /^a?/
+ }
+ else if (c == '|') {
+ // whole match so far is optional. Nothing we can do here.
+ return string();
+ }
+ else if (c == '\\') {
+ c = *(regex++);
+ if (c == 'Q'){
+ // \Q...\E quotes everything inside
+ while (*regex) {
+ c = (*regex++);
+ if (c == '\\' && (*regex == 'E')){
+ regex++; //skip the 'E'
+ break; // go back to start of outer loop
+ }
+ else {
+ ss << c; // character should match itself
+ }
+ }
+ }
+ else if ((c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z') ||
+                         (c >= '0' && c <= '9') ||
+ (c == '\0')) {
+ // don't know what to do with these
+ r = ss.str();
+ break;
+ }
+ else {
+                    // backslash followed by a non-alphanumeric char matches that char literally
+ ss << c;
+ }
+ }
+ else if (strchr("^$.[()+{", c)) {
+ // list of "metacharacters" from man pcrepattern
+ r = ss.str();
+ break;
+ }
+ else if (extended && c == '#') {
+ // comment
+ r = ss.str();
+ break;
+ }
+ else if (extended && isspace(c)) {
+ continue;
+ }
+ else {
+ // self-matching char
+ ss << c;
+ }
+ }
+
+ if ( r.empty() && *regex == 0 ) {
+ r = ss.str();
+ if (purePrefix) *purePrefix = !r.empty();
+ }
+
+ return r;
+ }
+ inline string simpleRegex(const BSONElement& e) {
+ switch(e.type()) {
+ case RegEx:
+ return simpleRegex(e.regex(), e.regexFlags());
+ case Object: {
+ BSONObj o = e.embeddedObject();
+ return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe());
+ }
+ default: assert(false); return ""; //return squashes compiler warning
+ }
+ }
+
+ string simpleRegexEnd( string regex ) {
+ ++regex[ regex.length() - 1 ];
+ return regex;
+ }
+
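+    /*
+     * Worked example (illustrative): a prefix regex becomes a half open index
+     * range.
+     *
+     *   simpleRegex( "^abc", "", NULL ) => "abc"
+     *   simpleRegexEnd( "abc" )         => "abd"
+     *
+     * so /^abc/ scans keys in [ "abc", "abd" ), exactly the strings with
+     * prefix "abc".
+     */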
+
+ FieldRange::FieldRange( const BSONElement &e, bool singleKey, bool isNot, bool optimize )
+ : _singleKey( singleKey ) {
+ int op = e.getGtLtOp();
+
+ // NOTE with $not, we could potentially form a complementary set of intervals.
+ if ( !isNot && !e.eoo() && e.type() != RegEx && op == BSONObj::opIN ) {
+ set<BSONElement,element_lt> vals;
+ vector<FieldRange> regexes;
+ uassert( 12580 , "invalid query" , e.isABSONObj() );
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement ie = i.next();
+ uassert( 15881, "$elemMatch not allowed within $in",
+ ie.type() != Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ if ( ie.type() == RegEx ) {
+ regexes.push_back( FieldRange( ie, singleKey, false, optimize ) );
+ }
+ else {
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ vals.insert( ie );
+ if ( ie.type() == Array ) {
+ BSONElement temp = ie.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ vals.insert( temp );
+ }
+ }
+ }
+
+ for( set<BSONElement,element_lt>::const_iterator i = vals.begin(); i != vals.end(); ++i )
+ _intervals.push_back( FieldInterval(*i) );
+
+ for( vector<FieldRange>::const_iterator i = regexes.begin(); i != regexes.end(); ++i )
+ *this |= *i;
+
+ return;
+ }
+
+ // A document array may be indexed by its first element, by undefined
+ // if it is empty, or as a full array if it is embedded within another
+ // array.
+ if ( e.type() == Array && op == BSONObj::Equality ) {
+
+ _intervals.push_back( FieldInterval(e) );
+ BSONElement temp = e.embeddedObject().firstElement();
+ if ( temp.eoo() ) {
+ temp = staticUndefined.firstElement();
+ }
+ if ( temp < e ) {
+ _intervals.insert( _intervals.begin() , temp );
+ }
+ else {
+ _intervals.push_back( FieldInterval(temp) );
+ }
+
+ return;
+ }
+
+ _intervals.push_back( FieldInterval() );
+ FieldInterval &initial = _intervals[ 0 ];
+ BSONElement &lower = initial._lower._bound;
+ bool &lowerInclusive = initial._lower._inclusive;
+ BSONElement &upper = initial._upper._bound;
+ bool &upperInclusive = initial._upper._inclusive;
+ lower = minKey.firstElement();
+ lowerInclusive = true;
+ upper = maxKey.firstElement();
+ upperInclusive = true;
+
+ if ( e.eoo() )
+ return;
+
+ bool existsSpec = false;
+ if ( op == BSONObj::opEXISTS ) {
+ existsSpec = e.trueValue();
+ }
+
+ if ( e.type() == RegEx
+ || (e.type() == Object && !e.embeddedObject()["$regex"].eoo())
+ ) {
+ uassert( 13454, "invalid regular expression operator", op == BSONObj::Equality || op == BSONObj::opREGEX );
+ if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes
+ const string r = simpleRegex(e);
+ if ( r.size() ) {
+ lower = addObj( BSON( "" << r ) ).firstElement();
+ upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement();
+ upperInclusive = false;
+ }
+ else {
+ BSONObjBuilder b1(32), b2(32);
+ b1.appendMinForType( "" , String );
+ lower = addObj( b1.obj() ).firstElement();
+
+ b2.appendMaxForType( "" , String );
+ upper = addObj( b2.obj() ).firstElement();
+ upperInclusive = false; //MaxForType String is an empty Object
+ }
+
+ // regex matches self - regex type > string type
+ if (e.type() == RegEx) {
+ BSONElement re = addObj( BSON( "" << e ) ).firstElement();
+ _intervals.push_back( FieldInterval(re) );
+ }
+ else {
+ BSONObj orig = e.embeddedObject();
+ BSONObjBuilder b;
+ b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe());
+ BSONElement re = addObj( b.obj() ).firstElement();
+ _intervals.push_back( FieldInterval(re) );
+ }
+
+ }
+ return;
+ }
+ if ( isNot ) {
+ switch( op ) {
+ case BSONObj::Equality:
+ return;
+// op = BSONObj::NE;
+// break;
+ case BSONObj::opALL:
+ case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in)
+ case BSONObj::opTYPE:
+ // no bound calculation
+ return;
+ case BSONObj::NE:
+ op = BSONObj::Equality;
+ break;
+ case BSONObj::LT:
+ op = BSONObj::GTE;
+ break;
+ case BSONObj::LTE:
+ op = BSONObj::GT;
+ break;
+ case BSONObj::GT:
+ op = BSONObj::LTE;
+ break;
+ case BSONObj::GTE:
+ op = BSONObj::LT;
+ break;
+ case BSONObj::opEXISTS:
+ existsSpec = !existsSpec;
+ break;
+ default: // otherwise doesn't matter
+ break;
+ }
+ }
+ switch( op ) {
+ case BSONObj::Equality:
+ lower = upper = e;
+ break;
+ case BSONObj::NE: {
+ // this will invalidate the upper/lower references above
+ _intervals.push_back( FieldInterval() );
+ // optimize doesn't make sense for negative ranges
+ _intervals[ 0 ]._upper._bound = e;
+ _intervals[ 0 ]._upper._inclusive = false;
+ _intervals[ 1 ]._lower._bound = e;
+ _intervals[ 1 ]._lower._inclusive = false;
+ _intervals[ 1 ]._upper._bound = maxKey.firstElement();
+ _intervals[ 1 ]._upper._inclusive = true;
+ optimize = false; // don't run optimize code below
+ break;
+ }
+ case BSONObj::LT:
+ upperInclusive = false;
+ case BSONObj::LTE:
+ upper = e;
+ break;
+ case BSONObj::GT:
+ lowerInclusive = false;
+ case BSONObj::GTE:
+ lower = e;
+ break;
+ case BSONObj::opALL: {
+ uassert( 10370 , "$all requires array", e.type() == Array );
+ BSONObjIterator i( e.embeddedObject() );
+ bool bound = false;
+ while ( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ // taken care of elsewhere
+ }
+ else if ( x.type() != RegEx ) {
+ lower = upper = x;
+ bound = true;
+ break;
+ }
+ }
+ if ( !bound ) { // if no good non regex bound found, try regex bounds
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() != RegEx )
+ continue;
+ string simple = simpleRegex( x.regex(), x.regexFlags() );
+ if ( !simple.empty() ) {
+ lower = addObj( BSON( "" << simple ) ).firstElement();
+ upper = addObj( BSON( "" << simpleRegexEnd( simple ) ) ).firstElement();
+ break;
+ }
+ }
+ }
+ break;
+ }
+ case BSONObj::opMOD: {
+ {
+ BSONObjBuilder b;
+ b.appendMinForType( "" , NumberDouble );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ {
+ BSONObjBuilder b;
+ b.appendMaxForType( "" , NumberDouble );
+ upper = addObj( b.obj() ).firstElement();
+ }
+ break;
+ }
+ case BSONObj::opTYPE: {
+ BSONType t = (BSONType)e.numberInt();
+ {
+ BSONObjBuilder b;
+ b.appendMinForType( "" , t );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ {
+ BSONObjBuilder b;
+ b.appendMaxForType( "" , t );
+ upper = addObj( b.obj() ).firstElement();
+ }
+
+ break;
+ }
+ case BSONObj::opREGEX:
+ case BSONObj::opOPTIONS:
+ // do nothing
+ break;
+ case BSONObj::opELEM_MATCH: {
+ log() << "warning: shouldn't get here?" << endl;
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ _special = "2d";
+ break;
+ case BSONObj::opEXISTS: {
+ if ( !existsSpec ) {
+ lower = upper = staticNull.firstElement();
+ }
+ optimize = false;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if ( optimize ) {
+ if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ) { // TODO: get rid of isSimpleType
+ BSONObjBuilder b;
+ b.appendMaxForType( lower.fieldName() , lower.type() );
+ upper = addObj( b.obj() ).firstElement();
+ }
+ else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ) { // TODO: get rid of isSimpleType
+ if( upper.type() == Date )
+ lowerInclusive = false;
+ BSONObjBuilder b;
+ b.appendMinForType( upper.fieldName() , upper.type() );
+ lower = addObj( b.obj() ).firstElement();
+ }
+ }
+
+ }
+
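+    /*
+     * Worked examples (illustrative): intervals the constructor above
+     * produces for common operators on a single key index.
+     *
+     *   { $gt: 5 }           => ( 5, maxForType ]        after optimize
+     *   { $ne: 5 }           => [ MinKey, 5 ) , ( 5, MaxKey ]
+     *   { $in: [ 3, 1, 1 ] } => [ 1, 1 ] , [ 3, 3 ]      sorted and deduped
+     */
+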
+ void FieldRange::finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other ) {
+ _intervals = newIntervals;
+ for( vector<BSONObj>::const_iterator i = other._objData.begin(); i != other._objData.end(); ++i )
+ _objData.push_back( *i );
+ if ( _special.size() == 0 && other._special.size() )
+ _special = other._special;
+ }
+
+ // as called, these functions find the max/min of a bound in the
+ // opposite direction, so inclusive bounds are considered less
+ // superlative
+ FieldBound maxFieldBound( const FieldBound &a, const FieldBound &b ) {
+ int cmp = a._bound.woCompare( b._bound, false );
+ if ( ( cmp == 0 && !b._inclusive ) || cmp < 0 )
+ return b;
+ return a;
+ }
+
+ FieldBound minFieldBound( const FieldBound &a, const FieldBound &b ) {
+ int cmp = a._bound.woCompare( b._bound, false );
+ if ( ( cmp == 0 && !b._inclusive ) || cmp > 0 )
+ return b;
+ return a;
+ }
+
+ bool fieldIntervalOverlap( const FieldInterval &one, const FieldInterval &two, FieldInterval &result ) {
+ result._lower = maxFieldBound( one._lower, two._lower );
+ result._upper = minFieldBound( one._upper, two._upper );
+ return result.strictValid();
+ }
+
+ const FieldRange &FieldRange::operator&=( const FieldRange &other ) {
+ if ( !_singleKey && nontrivial() ) {
+ if ( other <= *this ) {
+ *this = other;
+ }
+ return *this;
+ }
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ FieldInterval overlap;
+ if ( fieldIntervalOverlap( *i, *j, overlap ) ) {
+ newIntervals.push_back( overlap );
+ }
+ if ( i->_upper == minFieldBound( i->_upper, j->_upper ) ) {
+ ++i;
+ }
+ else {
+ ++j;
+ }
+ }
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
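+    /*
+     * Worked examples (illustrative): intersection walks both interval lists
+     * in step and keeps only the overlaps.
+     *
+     *   [ 1, 10 ] &= [ 5, 20 ]               => [ 5, 10 ]
+     *   ( [ 1, 3 ] , [ 7, 9 ] ) &= [ 2, 8 ]  => ( [ 2, 3 ] , [ 7, 8 ] )
+     */
+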
+ void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector<FieldInterval> &newIntervals ) {
+ if ( low._bound.eoo() ) {
+ low = lower._lower; high = lower._upper;
+ }
+ else {
+ int cmp = high._bound.woCompare( lower._lower._bound, false );
+ if ( ( cmp < 0 ) || ( cmp == 0 && !high._inclusive && !lower._lower._inclusive ) ) {
+ FieldInterval tmp;
+ tmp._lower = low;
+ tmp._upper = high;
+ newIntervals.push_back( tmp );
+ low = lower._lower; high = lower._upper;
+ }
+ else {
+ high = lower._upper;
+ }
+ }
+ }
+
+ const FieldRange &FieldRange::operator|=( const FieldRange &other ) {
+ vector<FieldInterval> newIntervals;
+ FieldBound low;
+ FieldBound high;
+ vector<FieldInterval>::const_iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
+ if ( ( cmp == 0 && i->_lower._inclusive ) || cmp < 0 ) {
+ handleInterval( *i, low, high, newIntervals );
+ ++i;
+ }
+ else {
+ handleInterval( *j, low, high, newIntervals );
+ ++j;
+ }
+ }
+ while( i != _intervals.end() ) {
+ handleInterval( *i, low, high, newIntervals );
+ ++i;
+ }
+ while( j != other._intervals.end() ) {
+ handleInterval( *j, low, high, newIntervals );
+ ++j;
+ }
+ FieldInterval tmp;
+ tmp._lower = low;
+ tmp._upper = high;
+ newIntervals.push_back( tmp );
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
+ const FieldRange &FieldRange::operator-=( const FieldRange &other ) {
+ vector<FieldInterval> newIntervals;
+ vector<FieldInterval>::iterator i = _intervals.begin();
+ vector<FieldInterval>::const_iterator j = other._intervals.begin();
+ while( i != _intervals.end() && j != other._intervals.end() ) {
+ int cmp = i->_lower._bound.woCompare( j->_lower._bound, false );
+ if ( cmp < 0 ||
+ ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) {
+ int cmp2 = i->_upper._bound.woCompare( j->_lower._bound, false );
+ if ( cmp2 < 0 ) {
+ newIntervals.push_back( *i );
+ ++i;
+ }
+ else if ( cmp2 == 0 ) {
+ newIntervals.push_back( *i );
+ if ( newIntervals.back()._upper._inclusive && j->_lower._inclusive ) {
+ newIntervals.back()._upper._inclusive = false;
+ }
+ ++i;
+ }
+ else {
+ newIntervals.push_back( *i );
+ newIntervals.back()._upper = j->_lower;
+ newIntervals.back()._upper.flipInclusive();
+ int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
+ if ( cmp3 < 0 ||
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
+ ++i;
+ }
+ else {
+ i->_lower = j->_upper;
+ i->_lower.flipInclusive();
+ ++j;
+ }
+ }
+ }
+ else {
+ int cmp2 = i->_lower._bound.woCompare( j->_upper._bound, false );
+ if ( cmp2 > 0 ||
+ ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_upper._inclusive ) ) ) {
+ ++j;
+ }
+ else {
+ int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false );
+ if ( cmp3 < 0 ||
+ ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) {
+ ++i;
+ }
+ else {
+ i->_lower = j->_upper;
+ i->_lower.flipInclusive();
+ ++j;
+ }
+ }
+ }
+ }
+ while( i != _intervals.end() ) {
+ newIntervals.push_back( *i );
+ ++i;
+ }
+ finishOperation( newIntervals, other );
+ return *this;
+ }
+
+ // TODO write a proper implementation that doesn't do a full copy
+ bool FieldRange::operator<=( const FieldRange &other ) const {
+ FieldRange temp = *this;
+ temp -= other;
+ return temp.empty();
+ }
+
+ void FieldRange::setExclusiveBounds() {
+ for( vector<FieldInterval>::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ i->_lower._inclusive = false;
+ i->_upper._inclusive = false;
+ }
+ }
+
+ void FieldRange::reverse( FieldRange &ret ) const {
+ assert( _special.empty() );
+ ret._intervals.clear();
+ ret._objData = _objData;
+ for( vector<FieldInterval>::const_reverse_iterator i = _intervals.rbegin(); i != _intervals.rend(); ++i ) {
+ FieldInterval fi;
+ fi._lower = i->_upper;
+ fi._upper = i->_lower;
+ ret._intervals.push_back( fi );
+ }
+ }
+
+ BSONObj FieldRange::addObj( const BSONObj &o ) {
+ _objData.push_back( o );
+ return o;
+ }
+
+ string FieldInterval::toString() const {
+ StringBuilder buf;
+ buf << ( _lower._inclusive ? "[" : "(" );
+ buf << _lower._bound;
+ buf << " , ";
+ buf << _upper._bound;
+ buf << ( _upper._inclusive ? "]" : ")" );
+ return buf.str();
+ }
+
+ string FieldRange::toString() const {
+ StringBuilder buf;
+        buf << "(FieldRange special: " << _special << " singleKey: " << _singleKey << " intervals: ";
+ for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) {
+ buf << i->toString();
+ }
+
+ buf << ")";
+ return buf.str();
+ }
+
+ string FieldRangeSet::getSpecial() const {
+ string s = "";
+ for ( map<string,FieldRange>::const_iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) {
+ if ( i->second.getSpecial().size() == 0 )
+ continue;
+ uassert( 13033 , "can't have 2 special fields" , s.size() == 0 );
+ s = i->second.getSpecial();
+ }
+ return s;
+ }
+
+ /**
+     * Btree scanning for a multidimensional key range will yield a
+ * multidimensional box. The idea here is that if an 'other'
+ * multidimensional box contains the current box we don't have to scan
+ * the current box. If the 'other' box contains the current box in
+ * all dimensions but one, we can safely subtract the values of 'other'
+ * along that one dimension from the values for the current box on the
+ * same dimension. In other situations, subtracting the 'other'
+ * box from the current box yields a result that is not a box (but
+ * rather can be expressed as a union of boxes). We don't support
+ * such splitting currently in calculating index ranges. Note that
+ * where I have said 'box' above, I actually mean sets of boxes because
+ * a field range can consist of multiple intervals.
+ */
+ const FieldRangeSet &FieldRangeSet::operator-=( const FieldRangeSet &other ) {
+ int nUnincluded = 0;
+ string unincludedKey;
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( nUnincluded < 2 && i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ if ( i->second <= j->second ) {
+ // nothing
+ }
+ else {
+ ++nUnincluded;
+ unincludedKey = i->first;
+ }
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ ++i;
+ }
+ else {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ }
+ if ( j != other._ranges.end() ) {
+ // other has a bound we don't, nothing can be done
+ return *this;
+ }
+ if ( nUnincluded > 1 ) {
+ return *this;
+ }
+ if ( nUnincluded == 0 ) {
+ makeEmpty();
+ return *this;
+ }
+ // nUnincluded == 1
+ range( unincludedKey.c_str() ) -= other.range( unincludedKey.c_str() );
+ appendQueries( other );
+ return *this;
+ }
+
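+    /*
+     * Worked example (illustrative): with this = { a: [ 0, 10 ], b: [ 0, 10 ] }
+     * and other = { a: [ 0, 10 ], b: [ 0, 5 ] }, only "b" is unincluded, so
+     * subtraction happens along that one dimension, giving
+     * { a: [ 0, 10 ], b: ( 5, 10 ] }. Had two fields been unincluded, the set
+     * would be left unchanged, since the difference would no longer be a box.
+     */
+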
+ const FieldRangeSet &FieldRangeSet::operator&=( const FieldRangeSet &other ) {
+ map<string,FieldRange>::iterator i = _ranges.begin();
+ map<string,FieldRange>::const_iterator j = other._ranges.begin();
+ while( i != _ranges.end() && j != other._ranges.end() ) {
+ int cmp = i->first.compare( j->first );
+ if ( cmp == 0 ) {
+ // Same field name, so find range intersection.
+ i->second &= j->second;
+ ++i;
+ ++j;
+ }
+ else if ( cmp < 0 ) {
+ // Field present in *this.
+ ++i;
+ }
+ else {
+ // Field not present in *this, so add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ }
+ while( j != other._ranges.end() ) {
+ // Field not present in *this, add it.
+ range( j->first.c_str() ) = j->second;
+ ++j;
+ }
+ appendQueries( other );
+ return *this;
+ }
+
+ void FieldRangeSet::appendQueries( const FieldRangeSet &other ) {
+ for( vector<BSONObj>::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) {
+ _queries.push_back( *i );
+ }
+ }
+
+ void FieldRangeSet::makeEmpty() {
+ for( map<string,FieldRange>::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ i->second.makeEmpty();
+ }
+ }
+
+ void FieldRangeSet::processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ) {
+ BSONElement g = f;
+ int op2 = g.getGtLtOp();
+ if ( op2 == BSONObj::opALL ) {
+ BSONElement h = g;
+ uassert( 13050 , "$all requires array", h.type() == Array );
+ BSONObjIterator i( h.embeddedObject() );
+ if( i.more() ) {
+ BSONElement x = i.next();
+ if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) {
+ g = x.embeddedObject().firstElement();
+ op2 = g.getGtLtOp();
+ }
+ }
+ }
+ if ( op2 == BSONObj::opELEM_MATCH ) {
+ BSONObjIterator k( g.embeddedObjectUserCheck() );
+ while ( k.more() ) {
+ BSONElement h = k.next();
+ StringBuilder buf(32);
+ buf << fieldName << "." << h.fieldName();
+ string fullname = buf.str();
+
+ int op3 = getGtLtOp( h );
+ if ( op3 == BSONObj::Equality ) {
+ range( fullname.c_str() ) &= FieldRange( h , _singleKey , isNot , optimize );
+ }
+ else {
+ BSONObjIterator l( h.embeddedObject() );
+ while ( l.more() ) {
+ range( fullname.c_str() ) &= FieldRange( l.next() , _singleKey , isNot , optimize );
+ }
+ }
+ }
+ }
+ else {
+ range( fieldName ) &= FieldRange( f , _singleKey , isNot , optimize );
+ }
+ }
+
+ void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) {
+ if ( e.fieldName()[ 0 ] == '$' ) {
+ if ( strcmp( e.fieldName(), "$and" ) == 0 ) {
+ uassert( 14816 , "$and expression must be a nonempty array" , e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator i( e.embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ uassert( 14817 , "$and elements must be objects" , e.type() == Object );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ processQueryField( j.next(), optimize );
+ }
+ }
+ }
+
+ if ( strcmp( e.fieldName(), "$where" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ return;
+ }
+
+ if ( strcmp( e.fieldName(), "$nor" ) == 0 ) {
+ return;
+ }
+ }
+
+ bool equality = ( getGtLtOp( e ) == BSONObj::Equality );
+ if ( equality && e.type() == Object ) {
+ equality = ( strcmp( e.embeddedObject().firstElementFieldName(), "$not" ) != 0 );
+ }
+
+ if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) {
+ range( e.fieldName() ) &= FieldRange( e , _singleKey , false , optimize );
+ }
+ if ( !equality ) {
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ if ( strcmp( f.fieldName(), "$not" ) == 0 ) {
+ switch( f.type() ) {
+ case Object: {
+ BSONObjIterator k( f.embeddedObject() );
+ while( k.more() ) {
+ BSONElement g = k.next();
+ uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality );
+ processOpElement( e.fieldName(), g, true, optimize );
+ }
+ break;
+ }
+ case RegEx:
+ processOpElement( e.fieldName(), f, true, optimize );
+ break;
+ default:
+ uassert( 13041, "invalid use of $not", false );
+ }
+ }
+ else {
+ processOpElement( e.fieldName(), f, false, optimize );
+ }
+ }
+ }
+ }
+
+ FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj &query, bool singleKey, bool optimize )
+ : _ns( ns ), _queries( 1, query.getOwned() ), _singleKey( singleKey ) {
+ BSONObjIterator i( _queries[ 0 ] );
+
+ while( i.more() ) {
+ processQueryField( i.next(), optimize );
+ }
+ }
+
+ FieldRangeVector::FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction )
+ :_indexSpec( indexSpec ), _direction( direction >= 0 ? 1 : -1 ) {
+ _queries = frs._queries;
+ BSONObjIterator i( _indexSpec.keyPattern );
+ set< string > baseObjectNontrivialPrefixes;
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const FieldRange *range = &frs.range( e.fieldName() );
+ if ( !frs.singleKey() ) {
+ string prefix = str::before( e.fieldName(), '.' );
+ if ( baseObjectNontrivialPrefixes.count( prefix ) > 0 ) {
+ // A field with the same parent field has already been
+                    // constrained, and with a multikey index we cannot
+ // constrain this field.
+ range = &frs.trivialRange();
+ } else {
+ if ( range->nontrivial() ) {
+ baseObjectNontrivialPrefixes.insert( prefix );
+ }
+ }
+ }
+ int number = (int) e.number(); // returns 0.0 if not numeric
+ bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+ if ( forward ) {
+ _ranges.push_back( *range );
+ }
+ else {
+ _ranges.push_back( FieldRange( BSONObj().firstElement(), frs.singleKey(), false, true ) );
+ range->reverse( _ranges.back() );
+ }
+ assert( !_ranges.back().empty() );
+ }
+ uassert( 13385, "combinatorial limit of $in partitioning of result set exceeded", size() < 1000000 );
+ }
+
+ BSONObj FieldRangeVector::startKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().front();
+ b.appendAs( fi._lower._bound, "" );
+ }
+ return b.obj();
+ }
+
+ BSONObj FieldRangeVector::endKey() const {
+ BSONObjBuilder b;
+ for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ const FieldInterval &fi = i->intervals().back();
+ b.appendAs( fi._upper._bound, "" );
+ }
+ return b.obj();
+ }
+
+ BSONObj FieldRangeVector::obj() const {
+ BSONObjBuilder b;
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int i = 0; i < (int)_ranges.size(); ++i ) {
+ BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) );
+ for( vector<FieldInterval>::const_iterator j = _ranges[ i ].intervals().begin();
+ j != _ranges[ i ].intervals().end(); ++j ) {
+ a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() );
+ }
+ a.done();
+ }
+ return b.obj();
+ }
+
+ FieldRange *FieldRangeSet::__singleKeyTrivialRange = 0;
+ FieldRange *FieldRangeSet::__multiKeyTrivialRange = 0;
+ const FieldRange &FieldRangeSet::trivialRange() const {
+ FieldRange *&ret = _singleKey ? __singleKeyTrivialRange : __multiKeyTrivialRange;
+ if ( ret == 0 ) {
+ ret = new FieldRange( BSONObj().firstElement(), _singleKey, false, true );
+ }
+ return *ret;
+ }
+
+ BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const {
+ BSONObj fields = _fields;
+ if ( fields.isEmpty() ) {
+ BSONObjBuilder b;
+ for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) {
+ b.append( i->first, 1 );
+ }
+ fields = b.obj();
+ }
+ BSONObjBuilder b;
+ BSONObjIterator i( fields );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const char *name = e.fieldName();
+ const FieldRange &eRange = range( name );
+ assert( !eRange.empty() );
+ if ( eRange.equality() )
+ b.appendAs( eRange.min(), name );
+ else if ( eRange.nontrivial() ) {
+ BSONObj o;
+ BSONObjBuilder c;
+ if ( eRange.min().type() != MinKey )
+ c.appendAs( eRange.min(), eRange.minInclusive() ? "$gte" : "$gt" );
+ if ( eRange.max().type() != MaxKey )
+ c.appendAs( eRange.max(), eRange.maxInclusive() ? "$lte" : "$lt" );
+ o = c.obj();
+ b.append( name, o );
+ }
+ }
+ return b.obj();
+ }
+
+ QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const {
+ return QueryPattern( *this, sort );
+ }
+
+ // TODO get rid of this
+ BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const {
+ typedef vector<pair<shared_ptr<BSONObjBuilder>, shared_ptr<BSONObjBuilder> > > BoundBuilders;
+ BoundBuilders builders;
+ builders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ BSONObjIterator i( keyPattern );
+ bool ineq = false; // until ineq is true, we are just dealing with equality and $in bounds
+ while( i.more() ) {
+ BSONElement e = i.next();
+ const FieldRange &fr = range( e.fieldName() );
+ int number = (int) e.number(); // returns 0.0 if not numeric
+ bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 );
+ if ( !ineq ) {
+ if ( fr.equality() ) {
+ for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
+ j->first->appendAs( fr.min(), "" );
+ j->second->appendAs( fr.min(), "" );
+ }
+ }
+ else {
+ if ( !fr.inQuery() ) {
+ ineq = true;
+ }
+ BoundBuilders newBuilders;
+ const vector<FieldInterval> &intervals = fr.intervals();
+ for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) {
+ BSONObj first = i->first->obj();
+ BSONObj second = i->second->obj();
+
+ const unsigned maxCombinations = 4000000;
+ if ( forward ) {
+ for( vector<FieldInterval>::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) {
+ uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ newBuilders.back().first->appendElements( first );
+ newBuilders.back().second->appendElements( second );
+ newBuilders.back().first->appendAs( j->_lower._bound, "" );
+ newBuilders.back().second->appendAs( j->_upper._bound, "" );
+ }
+ }
+ else {
+ for( vector<FieldInterval>::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) {
+ uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations );
+ newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) );
+ newBuilders.back().first->appendElements( first );
+ newBuilders.back().second->appendElements( second );
+ newBuilders.back().first->appendAs( j->_upper._bound, "" );
+ newBuilders.back().second->appendAs( j->_lower._bound, "" );
+ }
+ }
+ }
+ builders = newBuilders;
+ }
+ }
+ else {
+ for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) {
+ j->first->appendAs( forward ? fr.min() : fr.max(), "" );
+ j->second->appendAs( forward ? fr.max() : fr.min(), "" );
+ }
+ }
+ }
+ BoundList ret;
+ for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i )
+ ret.push_back( make_pair( i->first->obj(), i->second->obj() ) );
+ return ret;
+ }
+
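+    /*
+     * Worked example (illustrative): for key pattern { a: 1, b: 1 } and query
+     * { a: { $in: [ 1, 2 ] }, b: { $gt: 3 } }, the $in on the equality prefix
+     * splits the bounds into two start/end key pairs:
+     *
+     *   [ { "": 1, "": 3 } , { "": 1, "": <max for b> } ]
+     *   [ { "": 2, "": 3 } , { "": 2, "": <max for b> } ]
+     */
+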
+ FieldRangeSet *FieldRangeSet::subset( const BSONObj &fields ) const {
+ FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj(), _singleKey, true );
+ BSONObjIterator i( fields );
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( range( e.fieldName() ).nontrivial() ) {
+ ret->range( e.fieldName() ) = range( e.fieldName() );
+ }
+ }
+ ret->_queries = _queries;
+ return ret;
+ }
+
+ bool FieldRangeSetPair::noNontrivialRanges() const {
+ return _singleKey.matchPossible() && _singleKey.nNontrivialRanges() == 0 &&
+ _multiKey.matchPossible() && _multiKey.nNontrivialRanges() == 0;
+ }
+
+ FieldRangeSetPair &FieldRangeSetPair::operator&=( const FieldRangeSetPair &other ) {
+ _singleKey &= other._singleKey;
+ _multiKey &= other._multiKey;
+ return *this;
+ }
+
+ FieldRangeSetPair &FieldRangeSetPair::operator-=( const FieldRangeSet &scanned ) {
+ _singleKey -= scanned;
+ _multiKey -= scanned;
+ return *this;
+ }
+
+ BSONObj FieldRangeSetPair::simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const {
+ return frsForIndex( d, idxNo ).simplifiedQuery( keyPattern );
+ }
+
+ void FieldRangeSetPair::assertValidIndex( const NamespaceDetails *d, int idxNo ) const {
+ massert( 14048, "FieldRangeSetPair invalid index specified", idxNo >= 0 && idxNo < d->nIndexes );
+ }
+
+ const FieldRangeSet &FieldRangeSetPair::frsForIndex( const NamespaceDetails* nsd, int idxNo ) const {
+ assertValidIndexOrNoIndex( nsd, idxNo );
+ if ( idxNo < 0 ) {
+ // An unindexed cursor cannot have a "single key" constraint.
+ return _multiKey;
+ }
+ return nsd->isMultikey( idxNo ) ? _multiKey : _singleKey;
+ }
+
+ bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const {
+ bool eq;
+ int l = matchingLowElement( e, i, forward, eq );
+ return ( l % 2 == 0 ); // if we're inside an interval
+ }
+
+ // binary search for interval containing the specified element
+ // an even return value indicates that the element is contained within a valid interval
+ int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward, bool &lowEquality ) const {
+ lowEquality = false;
+ int l = -1;
+ int h = _ranges[ i ].intervals().size() * 2;
+ while( l + 1 < h ) {
+ int m = ( l + h ) / 2;
+ BSONElement toCmp;
+ bool toCmpInclusive;
+ const FieldInterval &interval = _ranges[ i ].intervals()[ m / 2 ];
+ if ( m % 2 == 0 ) {
+ toCmp = interval._lower._bound;
+ toCmpInclusive = interval._lower._inclusive;
+ }
+ else {
+ toCmp = interval._upper._bound;
+ toCmpInclusive = interval._upper._inclusive;
+ }
+ int cmp = toCmp.woCompare( e, false );
+ if ( !forward ) {
+ cmp = -cmp;
+ }
+ if ( cmp < 0 ) {
+ l = m;
+ }
+ else if ( cmp > 0 ) {
+ h = m;
+ }
+ else {
+ if ( m % 2 == 0 ) {
+ lowEquality = true;
+ }
+ int ret = m;
+ // if left match and inclusive, all good
+ // if left match and not inclusive, return right before left bound
+ // if right match and inclusive, return left bound
+ // if right match and not inclusive, return right bound
+ if ( ( m % 2 == 0 && !toCmpInclusive ) || ( m % 2 == 1 && toCmpInclusive ) ) {
+ --ret;
+ }
+ return ret;
+ }
+ }
+ assert( l + 1 == h );
+ return l;
+ }
+
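+    /*
+     * Worked example (illustrative): with intervals [ 2, 4 ] , [ 6, 8 ] for
+     * field i, the bounds searched are 2, 4, 6, 8 at positions 0-3:
+     *
+     *   matchingLowElement( 3 ) => 0  (even: inside [ 2, 4 ])
+     *   matchingLowElement( 5 ) => 1  (odd: in the gap between intervals)
+     *   matchingLowElement( 6 ) => 2  (even: on an inclusive lower bound)
+     */
+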
+ bool FieldRangeVector::matchesKey( const BSONObj &key ) const {
+ BSONObjIterator j( key );
+ BSONObjIterator k( _indexSpec.keyPattern );
+ for( int l = 0; l < (int)_ranges.size(); ++l ) {
+ int number = (int) k.next().number();
+ bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0;
+ if ( !matchesElement( j.next(), l, forward ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool FieldRangeVector::matches( const BSONObj &obj ) const {
+
+ bool ok = false;
+
+ // TODO The representation of matching keys could potentially be optimized
+ // more for the case at hand. (For example, we can potentially consider
+ // fields individually instead of constructing several bson objects using
+ // multikey arrays.) But getKeys() canonically defines the key set for a
+ // given object and for now we are using it as is.
+ BSONObjSet keys;
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ ok = true;
+ break;
+ }
+ }
+
+ LOG(5) << "FieldRangeVector::matches() returns " << ok << endl;
+
+ return ok;
+ }
+
+ BSONObj FieldRangeVector::firstMatch( const BSONObj &obj ) const {
+ // NOTE Only works in forward direction.
+ assert( _direction >= 0 );
+ BSONObjSet keys( BSONObjCmp( _indexSpec.keyPattern ) );
+ _indexSpec.getKeys( obj, keys );
+ for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+ if ( matchesKey( *i ) ) {
+ return *i;
+ }
+ }
+ return BSONObj();
+ }
+
+ // TODO optimize more
+ int FieldRangeVectorIterator::advance( const BSONObj &curr ) {
+ BSONObjIterator j( curr );
+ BSONObjIterator o( _v._indexSpec.keyPattern );
+ // track first field for which we are not at the end of the valid values,
+ // since we may need to advance from the key prefix ending with this field
+ int latestNonEndpoint = -1;
+ // iterate over fields to determine appropriate advance method
+ for( int i = 0; i < (int)_i.size(); ++i ) {
+ if ( i > 0 && !_v._ranges[ i - 1 ].intervals()[ _i[ i - 1 ] ].equality() ) {
+ // if last bound was inequality, we don't know anything about where we are for this field
+                // TODO if possible avoid this in certain cases, e.g. when the previous key
+                // and the current key have the same value in the previous field
+ setMinus( i );
+ }
+ bool eq = false;
+ BSONElement oo = o.next();
+ bool reverse = ( ( oo.number() < 0 ) ^ ( _v._direction < 0 ) );
+ BSONElement jj = j.next();
+ if ( _i[ i ] == -1 ) { // unknown position for this field, do binary search
+ bool lowEquality;
+ int l = _v.matchingLowElement( jj, i, !reverse, lowEquality );
+ if ( l % 2 == 0 ) { // we are in a valid range for this field
+ _i[ i ] = l / 2;
+ int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+ if ( diff > 1 ) {
+ latestNonEndpoint = i;
+ }
+ else if ( diff == 1 ) {
+ int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+ if ( x != 0 ) {
+ latestNonEndpoint = i;
+ }
+ }
+ continue;
+ }
+ else { // not in a valid range for this field - determine if and how to advance
+ // check if we're after the last interval for this field
+ if ( l == (int)_v._ranges[ i ].intervals().size() * 2 - 1 ) {
+ if ( latestNonEndpoint == -1 ) {
+ return -2;
+ }
+ setZero( latestNonEndpoint + 1 );
+ // skip to curr / latestNonEndpoint + 1 / superlative
+ _after = true;
+ return latestNonEndpoint + 1;
+ }
+ _i[ i ] = ( l + 1 ) / 2;
+ if ( lowEquality ) {
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
+ // skip to curr / i / nextbounds
+ _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+ for( int j = i + 1; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ _after = false;
+ return i;
+ }
+ }
+ bool first = true;
+ // _i[ i ] != -1, so we have a starting interval for this field
+ // which serves as a lower/equal bound on the first iteration -
+ // we advance from this interval to find a matching interval
+ while( _i[ i ] < (int)_v._ranges[ i ].intervals().size() ) {
+ // compare to current interval's upper bound
+ int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+ if ( reverse ) {
+ x = -x;
+ }
+ if ( x == 0 && _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._inclusive ) {
+ eq = true;
+ break;
+ }
+ // see if we're less than the upper bound
+ if ( x > 0 ) {
+ if ( i == 0 && first ) {
+                        // the value of the 1st field won't go backward, so don't check the lower bound
+                        // TODO maybe we can rely on the 'first' check alone?
+ break;
+ }
+ // if it's an equality interval, don't need to compare separately to lower bound
+ if ( !_v._ranges[ i ].intervals()[ _i[ i ] ].equality() ) {
+ // compare to current interval's lower bound
+ x = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound.woCompare( jj, false );
+ if ( reverse ) {
+ x = -x;
+ }
+ }
+                    // if we're equal to the lower bound but the bound is not inclusive, advance
+ if ( ( x == 0 && !_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive ) ) {
+ setZero( i + 1 );
+ // skip to curr / i + 1 / superlative
+ _after = true;
+ return i + 1;
+ }
+ // if we're less than the lower bound, advance
+ if ( x > 0 ) {
+ setZero( i + 1 );
+ // skip to curr / i / nextbounds
+ _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+ _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+                        for( int k = i + 1; k < (int)_i.size(); ++k ) {
+                            _cmp[ k ] = &_v._ranges[ k ].intervals().front()._lower._bound;
+                            _inc[ k ] = _v._ranges[ k ].intervals().front()._lower._inclusive;
+                        }
+ _after = false;
+ return i;
+ }
+ else {
+ break;
+ }
+ }
+ // we're above the upper bound, so try next interval and reset remaining fields
+ ++_i[ i ];
+ setZero( i + 1 );
+ first = false;
+ }
+ int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+ if ( diff > 1 || ( !eq && diff == 1 ) ) {
+ // check if we're not at the end of valid values for this field
+ latestNonEndpoint = i;
+ }
+ else if ( diff == 0 ) { // check if we're past the last interval for this field
+ if ( latestNonEndpoint == -1 ) {
+ return -2;
+ }
+ // more values possible, skip...
+ setZero( latestNonEndpoint + 1 );
+ // skip to curr / latestNonEndpoint + 1 / superlative
+ _after = true;
+ return latestNonEndpoint + 1;
+ }
+ }
+ return -1;
+ }
+
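+    // Point each field's skip-comparison element (later exposed via cmp() and
+    // inc()) at the lower bound of that field's first interval.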
+ void FieldRangeVectorIterator::prepDive() {
+ for( int j = 0; j < (int)_i.size(); ++j ) {
+ _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+ _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+ }
+ }
+
+ BSONObj FieldRangeVectorIterator::startKey() {
+ BSONObjBuilder b;
+        for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._lower._bound, "" );
+ }
+ return b.obj();
+ }
+
+ // temp
+ BSONObj FieldRangeVectorIterator::endKey() {
+ BSONObjBuilder b;
+        for( unsigned int i = 0; i < _i.size(); ++i ) {
+ const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+ b.appendAs( fi._upper._bound, "" );
+ }
+ return b.obj();
+ }
+
+ OrRangeGenerator::OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize )
+ : _baseSet( ns, query, optimize ), _orFound() {
+
+ BSONObjIterator i( _baseSet.originalQuery() );
+
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+ uassert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13263, "$or array must contain objects", f.type() == Object );
+ _orSets.push_back( FieldRangeSetPair( ns, f.embeddedObject(), optimize ) );
+ uassert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() );
+ _originalOrSets.push_back( _orSets.back() );
+ }
+ _orFound = true;
+ continue;
+ }
+ }
+ }
+
+ void OrRangeGenerator::assertMayPopOrClause() {
+ massert( 13274, "no or clause to pop", !orFinished() );
+ }
+
+ void OrRangeGenerator::popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern ) {
+ assertMayPopOrClause();
+ auto_ptr<FieldRangeSet> holder;
+ const FieldRangeSet *toDiff = &_originalOrSets.front().frsForIndex( nsd, idxNo );
+ BSONObj indexSpec = keyPattern;
+ if ( !indexSpec.isEmpty() && toDiff->matchPossibleForIndex( indexSpec ) ) {
+ holder.reset( toDiff->subset( indexSpec ) );
+ toDiff = holder.get();
+ }
+ popOrClause( toDiff, nsd, idxNo, keyPattern );
+ }
+
+ void OrRangeGenerator::popOrClauseSingleKey() {
+ assertMayPopOrClause();
+ FieldRangeSet *toDiff = &_originalOrSets.front()._singleKey;
+ popOrClause( toDiff );
+ }
+
+ /**
+     * Removes the top $or clause, which would have been recently scanned, and
+     * removes the field ranges it covers from all subsequent $or clauses. As a
+     * side effect, this function may invalidate the return values of topFrsp()
+     * calls made before this function was called.
+     * @param keyPattern - Keys of the index that was used to satisfy the last
+     * $or clause. Used to determine the range of keys that were scanned. If
+ * empty we do not constrain the previous clause's ranges using index keys,
+ * which may reduce opportunities for range elimination.
+ */
+ void OrRangeGenerator::popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) {
+ list<FieldRangeSetPair>::iterator i = _orSets.begin();
+ list<FieldRangeSetPair>::iterator j = _originalOrSets.begin();
+ ++i;
+ ++j;
+ while( i != _orSets.end() ) {
+ *i -= *toDiff;
+ // Check if match is possible at all, and if it is possible for the recently scanned index.
+ if( !i->matchPossible() || ( d && !i->matchPossibleForIndex( d, idxNo, keyPattern ) ) ) {
+ i = _orSets.erase( i );
+ j = _originalOrSets.erase( j );
+ }
+ else {
+ ++i;
+ ++j;
+ }
+ }
+ _oldOrSets.push_front( _orSets.front() );
+ _orSets.pop_front();
+ _originalOrSets.pop_front();
+ }
+
+ struct SimpleRegexUnitTest : UnitTest {
+ void run() {
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^foo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "foo" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f?oo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^fz?oo");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f", "");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "^f", "m");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "m");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "f" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af", "mi");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "" );
+ }
+ {
+ BSONObjBuilder b;
+ b.appendRegex("r", "\\Af \t\vo\n\ro \\ \\# #comment", "mx");
+ BSONObj o = b.done();
+ assert( simpleRegex(o.firstElement()) == "foo #" );
+ }
+ {
+ assert( simpleRegex("^\\Qasdf\\E", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf\\E.*", "", NULL) == "asdf" );
+ assert( simpleRegex("^\\Qasdf", "", NULL) == "asdf" ); // PCRE supports this
+ assert( simpleRegex("^\\Qasdf\\\\E", "", NULL) == "asdf\\" );
+ assert( simpleRegex("^\\Qas.*df\\E", "", NULL) == "as.*df" );
+ assert( simpleRegex("^\\Qas\\Q[df\\E", "", NULL) == "as\\Q[df" );
+ assert( simpleRegex("^\\Qas\\E\\\\E\\Q$df\\E", "", NULL) == "as\\E$df" ); // quoted string containing \E
+ }
+
+ }
+ } simple_regex_unittest;
+
+
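+    /**
+     * Applies a command object's "skip" and "limit" fields to a count.
+     * Worked example (illustrative): with num = 100, skip = 10 and limit = 20,
+     * the skip reduces 100 to 90 and the limit then caps the result at 20.
+     */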
+ long long applySkipLimit( long long num , const BSONObj& cmd ) {
+ BSONElement s = cmd["skip"];
+ BSONElement l = cmd["limit"];
+
+ if ( s.isNumber() ) {
+ num = num - s.numberLong();
+ if ( num < 0 ) {
+ num = 0;
+ }
+ }
+
+ if ( l.isNumber() ) {
+ long long limit = l.numberLong();
+ if ( limit < num ) {
+ num = limit;
+ }
+ }
+
+ return num;
+ }
+
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil.h b/src/mongo/db/queryutil.h
new file mode 100644
index 00000000000..aefef27cc8b
--- /dev/null
+++ b/src/mongo/db/queryutil.h
@@ -0,0 +1,443 @@
+// @file queryutil.h - Utility classes representing ranges of valid BSONElement values for a query.
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "indexkey.h"
+
+namespace mongo {
+
+ /**
+ * One side of an interval of valid BSONElements, specified by a value and a
+ * boolean indicating whether the interval includes the value.
+ */
+ struct FieldBound {
+ BSONElement _bound;
+ bool _inclusive;
+ bool operator==( const FieldBound &other ) const {
+ return _bound.woCompare( other._bound ) == 0 &&
+ _inclusive == other._inclusive;
+ }
+ void flipInclusive() { _inclusive = !_inclusive; }
+ };
+
+ /** A closed interval composed of a lower and an upper FieldBound. */
+ struct FieldInterval {
+ FieldInterval() : _cachedEquality( -1 ) {}
+ FieldInterval( const BSONElement& e ) : _cachedEquality( -1 ) {
+ _lower._bound = _upper._bound = e;
+ _lower._inclusive = _upper._inclusive = true;
+ }
+ FieldBound _lower;
+ FieldBound _upper;
+        /** @return true iff the interval can contain at least one element (i.e. it is non-empty). */
+ bool strictValid() const {
+ int cmp = _lower._bound.woCompare( _upper._bound, false );
+ return ( cmp < 0 || ( cmp == 0 && _lower._inclusive && _upper._inclusive ) );
+ }
+ /** @return true iff the interval is an equality constraint. */
+ bool equality() const;
+ mutable int _cachedEquality;
+
+ string toString() const;
+ };
+
+ /**
+ * An ordered list of FieldIntervals expressing constraints on valid
+ * BSONElement values for a field.
+ */
+ class FieldRange {
+ public:
+ FieldRange( const BSONElement &e , bool singleKey , bool isNot=false , bool optimize=true );
+
+ /** @return Range intersection with 'other'. */
+ const FieldRange &operator&=( const FieldRange &other );
+ /** @return Range union with 'other'. */
+ const FieldRange &operator|=( const FieldRange &other );
+        /** @return Range of elements included in 'this' but not 'other'. */
+ const FieldRange &operator-=( const FieldRange &other );
+ /** @return true iff this range is a subset of 'other'. */
+ bool operator<=( const FieldRange &other ) const;
+
+ /**
+ * If there are any valid values for this range, the extreme values can
+ * be extracted.
+ */
+
+ BSONElement min() const { assert( !empty() ); return _intervals[ 0 ]._lower._bound; }
+ BSONElement max() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._bound; }
+ bool minInclusive() const { assert( !empty() ); return _intervals[ 0 ]._lower._inclusive; }
+ bool maxInclusive() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._inclusive; }
+
+ /** @return true iff this range expresses a single equality interval. */
+ bool equality() const;
+ /** @return true if all the intervals for this range are equalities */
+ bool inQuery() const;
+ /** @return true iff this range does not include every BSONElement */
+ bool nontrivial() const;
+ /** @return true iff this range matches no BSONElements. */
+ bool empty() const { return _intervals.empty(); }
+
+ /** Empty the range so it matches no BSONElements. */
+ void makeEmpty() { _intervals.clear(); }
+ const vector<FieldInterval> &intervals() const { return _intervals; }
+ string getSpecial() const { return _special; }
+ /** Make component intervals noninclusive. */
+ void setExclusiveBounds();
+ /**
+ * Constructs a range where all FieldIntervals and FieldBounds are in
+ * the opposite order of the current range.
+ * NOTE the resulting intervals might not be strictValid().
+ */
+ void reverse( FieldRange &ret ) const;
+
+ string toString() const;
+ private:
+ BSONObj addObj( const BSONObj &o );
+ void finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other );
+ vector<FieldInterval> _intervals;
+ // Owns memory for our BSONElements.
+ vector<BSONObj> _objData;
+ string _special;
+ bool _singleKey;
+ };
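+
+    /*
+     * Illustrative example (not from the original source): for the query
+     * { a: { $gte: 3, $lt: 7 } } the FieldRange for 'a' holds a single
+     * FieldInterval with _lower = { _bound: 3, _inclusive: true } and
+     * _upper = { _bound: 7, _inclusive: false }, while { a: { $in: [ 1, 2 ] } }
+     * yields two equality intervals.
+     */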
+
+ /**
+ * A BoundList contains intervals specified by inclusive start
+ * and end bounds. The intervals should be nonoverlapping and occur in
+ * the specified direction of traversal. For example, given a simple index {i:1}
+ * and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList
+ * would be valid for index {i:-1} with direction -1.
+ */
+ typedef vector<pair<BSONObj,BSONObj> > BoundList;
+
+ class QueryPattern;
+
+ /**
+ * A set of FieldRanges determined from constraints on the fields of a query,
+ * that may be used to determine index bounds.
+ */
+ class FieldRangeSet {
+ public:
+ friend class OrRangeGenerator;
+ friend class FieldRangeVector;
+ FieldRangeSet( const char *ns, const BSONObj &query , bool singleKey , bool optimize=true );
+
+ /** @return true if there is a nontrivial range for the given field. */
+ bool hasRange( const char *fieldName ) const {
+ map<string, FieldRange>::const_iterator f = _ranges.find( fieldName );
+ return f != _ranges.end();
+ }
+ /** @return range for the given field. */
+ const FieldRange &range( const char *fieldName ) const;
+ /** @return range for the given field. */
+ FieldRange &range( const char *fieldName );
+ /** @return the number of nontrivial ranges. */
+ int nNontrivialRanges() const;
+ /** @return the field ranges comprising this set. */
+ const map<string,FieldRange> &ranges() const { return _ranges; }
+ /**
+ * @return true if a match could be possible on every field. Generally this
+ * is not useful information for a single key FieldRangeSet and
+ * matchPossibleForIndex() should be used instead.
+ */
+ bool matchPossible() const;
+ /**
+ * @return true if a match could be possible given the value of _singleKey
+ * and index key 'keyPattern'.
+ * @param keyPattern May be {} or {$natural:1} for a non index scan.
+ */
+ bool matchPossibleForIndex( const BSONObj &keyPattern ) const;
+
+ const char *ns() const { return _ns; }
+
+ /**
+ * @return a simplified query from the extreme values of the nontrivial
+ * fields.
+ * @param fields If specified, the fields of the returned object are
+ * ordered to match those of 'fields'.
+ */
+ BSONObj simplifiedQuery( const BSONObj &fields = BSONObj() ) const;
+
+ QueryPattern pattern( const BSONObj &sort = BSONObj() ) const;
+ string getSpecial() const;
+
+ /**
+ * @return a FieldRangeSet approximation of the documents in 'this' but
+ * not in 'other'. The approximation will be a superset of the documents
+ * in 'this' but not 'other'.
+ */
+ const FieldRangeSet &operator-=( const FieldRangeSet &other );
+ /** @return intersection of 'this' with 'other'. */
+ const FieldRangeSet &operator&=( const FieldRangeSet &other );
+
+ /**
+ * @return an ordered list of bounds generated using an index key pattern
+ * and traversal direction.
+ *
+         * NOTE This function is deprecated in the query optimizer and is
+         * currently used only by the sharding code.
+ */
+ BoundList indexBounds( const BSONObj &keyPattern, int direction ) const;
+
+ /**
+ * @return - A new FieldRangeSet based on this FieldRangeSet, but with only
+ * a subset of the fields.
+         * @param fields - Only the fields named in 'fields' will be included in
+         * the returned FieldRangeSet.
+ */
+ FieldRangeSet *subset( const BSONObj &fields ) const;
+
+ bool singleKey() const { return _singleKey; }
+
+ BSONObj originalQuery() const { return _queries[ 0 ]; }
+ private:
+ void appendQueries( const FieldRangeSet &other );
+ void makeEmpty();
+ void processQueryField( const BSONElement &e, bool optimize );
+ void processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize );
+ static FieldRange *__singleKeyTrivialRange;
+ static FieldRange *__multiKeyTrivialRange;
+ const FieldRange &trivialRange() const;
+ map<string,FieldRange> _ranges;
+ const char *_ns;
+ // Owns memory for FieldRange BSONElements.
+ vector<BSONObj> _queries;
+ bool _singleKey;
+ };
+
+ class NamespaceDetails;
+
+ /**
+ * A pair of FieldRangeSets, one representing constraints for single key
+ * indexes and the other representing constraints for multi key indexes and
+ * unindexed scans. In several member functions the caller is asked to
+ * supply an index so that the implementation may utilize the proper
+ * FieldRangeSet and return results that are appropriate with respect to that
+ * supplied index.
+ */
+ class FieldRangeSetPair {
+ public:
+ FieldRangeSetPair( const char *ns, const BSONObj &query, bool optimize=true )
+ :_singleKey( ns, query, true, optimize ), _multiKey( ns, query, false, optimize ) {}
+
+ /**
+ * @return the appropriate single or multi key FieldRangeSet for the specified index.
+ * @param idxNo -1 for non index scan.
+ */
+ const FieldRangeSet &frsForIndex( const NamespaceDetails* nsd, int idxNo ) const;
+
+ /** @return a field range in the single key FieldRangeSet. */
+ const FieldRange &singleKeyRange( const char *fieldName ) const {
+ return _singleKey.range( fieldName );
+ }
+ /** @return true if the range limits are equivalent to an empty query. */
+ bool noNontrivialRanges() const;
+ /** @return false if a match is impossible regardless of index. */
+ bool matchPossible() const { return _multiKey.matchPossible(); }
+ /**
+ * @return false if a match is impossible on the specified index.
+ * @param idxNo -1 for non index scan.
+ */
+ bool matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+
+ const char *ns() const { return _singleKey.ns(); }
+
+ string getSpecial() const { return _singleKey.getSpecial(); }
+
+ /** Intersect with another FieldRangeSetPair. */
+ FieldRangeSetPair &operator&=( const FieldRangeSetPair &other );
+ /**
+ * Subtract a FieldRangeSet, generally one expressing a range that has
+ * already been scanned.
+ */
+ FieldRangeSetPair &operator-=( const FieldRangeSet &scanned );
+
+ BoundList singleKeyIndexBounds( const BSONObj &keyPattern, int direction ) const {
+ return _singleKey.indexBounds( keyPattern, direction );
+ }
+
+ BSONObj originalQuery() const { return _singleKey.originalQuery(); }
+
+ private:
+ FieldRangeSetPair( const FieldRangeSet &singleKey, const FieldRangeSet &multiKey )
+ :_singleKey( singleKey ), _multiKey( multiKey ) {}
+ void assertValidIndex( const NamespaceDetails *d, int idxNo ) const;
+ void assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const;
+ /** matchPossibleForIndex() must be true. */
+ BSONObj simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const;
+ FieldRangeSet _singleKey;
+ FieldRangeSet _multiKey;
+ friend class OrRangeGenerator;
+ friend struct QueryUtilIndexed;
+ };
+
+ class IndexSpec;
+
+ /**
+ * An ordered list of fields and their FieldRanges, corresponding to valid
+ * index keys for a given index spec.
+ */
+ class FieldRangeVector {
+ public:
+ /**
+ * @param frs The valid ranges for all fields, as defined by the query spec
+ * @param indexSpec The index spec (key pattern and info)
+ * @param direction The direction of index traversal
+ */
+ FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction );
+
+ /** @return the number of index ranges represented by 'this' */
+ long long size();
+ /** @return starting point for an index traversal. */
+ BSONObj startKey() const;
+ /** @return end point for an index traversal. */
+ BSONObj endKey() const;
+ /** @return a client readable representation of 'this' */
+ BSONObj obj() const;
+
+ const IndexSpec& getSpec(){ return _indexSpec; }
+
+ /**
+ * @return true iff the provided document matches valid ranges on all
+ * of this FieldRangeVector's fields, which is the case iff this document
+ * would be returned while scanning the index corresponding to this
+ * FieldRangeVector. This function is used for $or clause deduping.
+ */
+ bool matches( const BSONObj &obj ) const;
+
+ /**
+ * @return first key of 'obj' that would be encountered by a forward
+ * index scan using this FieldRangeVector, BSONObj() if no such key.
+ */
+ BSONObj firstMatch( const BSONObj &obj ) const;
+
+ private:
+ int matchingLowElement( const BSONElement &e, int i, bool direction, bool &lowEquality ) const;
+ bool matchesElement( const BSONElement &e, int i, bool direction ) const;
+ bool matchesKey( const BSONObj &key ) const;
+ vector<FieldRange> _ranges;
+ const IndexSpec _indexSpec;
+ int _direction;
+ vector<BSONObj> _queries; // make sure mem owned
+ friend class FieldRangeVectorIterator;
+ };
+
+ /**
+ * Helper class for iterating through an ordered representation of keys
+ * to find those keys that match a specified FieldRangeVector.
+ */
+ class FieldRangeVectorIterator {
+ public:
+ FieldRangeVectorIterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _inc( _v._ranges.size(), false ), _after() {
+ }
+ static BSONObj minObject() {
+ BSONObjBuilder b; b.appendMinKey( "" );
+ return b.obj();
+ }
+ static BSONObj maxObject() {
+ BSONObjBuilder b; b.appendMaxKey( "" );
+ return b.obj();
+ }
+ /**
+ * @return Suggested advance method, based on current key.
+ * -2 Iteration is complete, no need to advance.
+ * -1 Advance to the next key, without skipping.
+ * >=0 Skip parameter. If @return is r, skip to the key comprised
+ * of the first r elements of curr followed by the (r+1)th and
+ * remaining elements of cmp() (with inclusivity specified by
+ * the (r+1)th and remaining elements of inc()). If after() is
+ * true, skip past this key not to it.
+ */
+ int advance( const BSONObj &curr );
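+        /*
+         * Illustrative only - a sketch (not from the original source) of how a
+         * key-scanning caller might consume advance()'s return value; cursor
+         * and skipToKey() are hypothetical stand-ins:
+         *
+         *   FieldRangeVectorIterator it( frv );
+         *   while( cursor.ok() ) {
+         *       int r = it.advance( cursor.currKey() );
+         *       if ( r == -2 ) break;                // iteration complete
+         *       if ( r >= 0 ) {                      // skip via cmp()/inc()/after()
+         *           skipToKey( cursor, r, it.cmp(), it.inc(), it.after() );
+         *           continue;
+         *       }
+         *       cursor.advance();                    // r == -1: plain advance
+         *   }
+         */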
+ const vector<const BSONElement *> &cmp() const { return _cmp; }
+ const vector<bool> &inc() const { return _inc; }
+ bool after() const { return _after; }
+ void prepDive();
+ void setZero( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = 0; }
+ void setMinus( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = -1; }
+ bool ok() { return _i[ 0 ] < (int)_v._ranges[ 0 ].intervals().size(); }
+ BSONObj startKey();
+ // temp
+ BSONObj endKey();
+ private:
+ const FieldRangeVector &_v;
+ vector<int> _i;
+ vector<const BSONElement*> _cmp;
+ vector<bool> _inc;
+ bool _after;
+ };
+
+ /**
+ * As we iterate through $or clauses this class generates a FieldRangeSetPair
+ * for the current $or clause, in some cases by excluding ranges that were
+ * included in a previous clause.
+ */
+ class OrRangeGenerator {
+ public:
+ OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize=true );
+
+ /**
+         * @return true iff we are done scanning $or clauses. If there's a
+         * useless $or clause, we won't use $or index ranges to help with scanning.
+ */
+ bool orFinished() const { return _orFound && _orSets.empty(); }
+ /** Iterates to the next $or clause by removing the current $or clause. */
+ void popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern );
+ void popOrClauseSingleKey();
+ /** @return FieldRangeSetPair for the current $or clause. */
+ FieldRangeSetPair *topFrsp() const;
+ /**
+ * @return original FieldRangeSetPair for the current $or clause. While the
+ * original bounds are looser, they are composed of fewer ranges and it
+ * is faster to do operations with them; when they can be used instead of
+ * more precise bounds, they should.
+ */
+ FieldRangeSetPair *topFrspOriginal() const;
+
+ string getSpecial() const { return _baseSet.getSpecial(); }
+
+ bool moreOrClauses() const { return !_orSets.empty(); }
+ private:
+ void assertMayPopOrClause();
+ void popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d = 0, int idxNo = -1, const BSONObj &keyPattern = BSONObj() );
+ FieldRangeSetPair _baseSet;
+ list<FieldRangeSetPair> _orSets;
+ list<FieldRangeSetPair> _originalOrSets;
+ // ensure memory is owned
+ list<FieldRangeSetPair> _oldOrSets;
+ bool _orFound;
+ friend struct QueryUtilIndexed;
+ };
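+
+    /*
+     * Illustrative use - a sketch (not from the original source) of how a
+     * query planner might iterate $or clauses; nsd, idxNo and keyPattern are
+     * assumed to come from the index selected for the current clause:
+     *
+     *   OrRangeGenerator org( ns, query );
+     *   while( org.moreOrClauses() ) {
+     *       FieldRangeSetPair *frsp = org.topFrsp();
+     *       // ... scan the selected index using ranges from *frsp ...
+     *       org.popOrClause( nsd, idxNo, keyPattern );
+     *   }
+     */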
+
+    /** returns a string that, when used as a matcher, would match a superset of the given regex
+ returns "" for complex regular expressions
+ used to optimize queries in some simple regex cases that start with '^'
+
+ if purePrefix != NULL, sets it to whether the regex can be converted to a range query
+ */
+ string simpleRegex(const char* regex, const char* flags, bool* purePrefix=NULL);
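+    // Examples, taken from SimpleRegexUnitTest in queryutil.cpp:
+    //   simpleRegex( "^foo", "", NULL ) == "foo"
+    //   simpleRegex( "^f?oo", "", NULL ) == ""   // optional char makes it too complex
+    //   simpleRegex( "^fz?oo", "", NULL ) == "f" // prefix before the optional part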
+
+ /** returns the upper bound of a query that matches prefix */
+ string simpleRegexEnd( string prefix );
+
+ long long applySkipLimit( long long num , const BSONObj& cmd );
+
+} // namespace mongo
+
+#include "queryutil-inl.h"
diff --git a/src/mongo/db/record.cpp b/src/mongo/db/record.cpp
new file mode 100644
index 00000000000..17987002efc
--- /dev/null
+++ b/src/mongo/db/record.cpp
@@ -0,0 +1,267 @@
+// record.cpp
+
+#include "pch.h"
+#include "pdfile.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+ namespace ps {
+
+ enum State {
+ In , Out, Unk
+ };
+
+ enum Constants {
+ SliceSize = 65536 ,
+ MaxChain = 20 , // intentionally very low
+ NumSlices = 10 ,
+ RotateTimeSecs = 90
+ };
+
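+        // hash() mixes the 16-bit chunks of a region address into a bucket index
+        // in [0, SliceSize); the small prime offsets keep a zero chunk from
+        // zeroing out the whole product.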
+ int hash( size_t region ) {
+ return
+ abs( ( ( 7 + (int)(region & 0xFFFF) )
+ * ( 11 + (int)( ( region >> 16 ) & 0xFFFF ) )
+#if defined(_WIN64) || defined(__amd64__)
+ * ( 13 + (int)( ( region >> 32 ) & 0xFFFF ) )
+ * ( 17 + (int)( ( region >> 48 ) & 0xFFFF ) )
+#endif
+ ) % SliceSize );
+ }
+
+
+ /**
+ * simple hash map for region -> status
+         * this constitutes a single slice of time
+ * it does chaining, but very short chains
+ */
+ class Slice {
+
+ struct Entry {
+ size_t region;
+ unsigned long long value;
+ };
+
+ public:
+
+ Slice() {
+ reset();
+ }
+
+ void reset() {
+ memset( _data , 0 , SliceSize * sizeof(Entry) );
+ }
+
+ State get( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , false );
+ if ( ! e )
+ return Unk;
+
+ return ( e->value & ( ((unsigned long long)1) << offset ) ) ? In : Out;
+ }
+
+ /**
+ * @return true if added, false if full
+ */
+ bool in( int regionHash , size_t region , short offset ) {
+ DEV assert( hash( region ) == regionHash );
+
+ Entry * e = _get( regionHash , region , true );
+ if ( ! e )
+ return false;
+
+ e->value |= ((unsigned long long)1) << offset;
+ return true;
+ }
+
+ private:
+
+ Entry* _get( int start , size_t region , bool add ) {
+ for ( int i=0; i<MaxChain; i++ ) {
+
+ int bucket = ( start + i ) % SliceSize;
+
+ if ( _data[bucket].region == 0 ) {
+ if ( ! add )
+ return 0;
+
+ _data[bucket].region = region;
+ return &_data[bucket];
+ }
+
+ if ( _data[bucket].region == region ) {
+ return &_data[bucket];
+ }
+ }
+ return 0;
+ }
+
+ Entry _data[SliceSize];
+ };
+
+
+ /**
+         * this contains many slices of time
+         * the idea is that you record memory status in the current time slice,
+         * and after a certain period of time it rolls off, so we check again
+ */
+ class Rolling {
+
+ public:
+ Rolling()
+ : _lock( "ps::Rolling" ){
+ _curSlice = 0;
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+
+ /**
+ * after this call, we assume the page is in ram
+             * @param doHalf if this is a known good access, only consult (and record within) the first half of the slices
+ * @return whether we know the page is in ram
+ */
+ bool access( size_t region , short offset , bool doHalf ) {
+ int regionHash = hash(region);
+
+ SimpleMutex::scoped_lock lk( _lock );
+
+ static int rarely_count = 0;
+ if ( rarely_count++ % 2048 == 0 ) {
+ long long now = Listener::getElapsedTimeMillis();
+ RARELY if ( now == 0 ) {
+ tlog() << "warning Listener::getElapsedTimeMillis returning 0ms" << endl;
+ }
+
+ if ( now - _lastRotate > ( 1000 * RotateTimeSecs ) ) {
+ _rotate();
+ }
+ }
+
+ for ( int i=0; i<NumSlices / ( doHalf ? 2 : 1 ); i++ ) {
+ int pos = (_curSlice+i)%NumSlices;
+ State s = _slices[pos].get( regionHash , region , offset );
+
+ if ( s == In )
+ return true;
+
+ if ( s == Out ) {
+ _slices[pos].in( regionHash , region , offset );
+ return false;
+ }
+ }
+
+ // we weren't in any slice
+ // so add to cur
+ if ( ! _slices[_curSlice].in( regionHash , region , offset ) ) {
+ _rotate();
+ _slices[_curSlice].in( regionHash , region , offset );
+ }
+ return false;
+ }
+
+ private:
+
+ void _rotate() {
+ _curSlice = ( _curSlice + 1 ) % NumSlices;
+ _slices[_curSlice].reset();
+ _lastRotate = Listener::getElapsedTimeMillis();
+ }
+
+ int _curSlice;
+ long long _lastRotate;
+ Slice _slices[NumSlices];
+
+ SimpleMutex _lock;
+ } rolling;
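+
+        // Illustrative summary: access() returns true only when the page was
+        // already marked present in one of the slices searched; otherwise it
+        // records the access (rotating slices when the current one is full)
+        // and returns false.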
+
+ }
+
+ bool Record::MemoryTrackingEnabled = true;
+
+ volatile int __record_touch_dummy = 1; // this is used to make sure the compiler doesn't get too smart on us
+    void Record::touch( bool entireRecord ) {
+ if ( lengthWithHeaders > HeaderSize ) { // this also makes sure lengthWithHeaders is in memory
+ char * addr = data;
+ char * end = data + netLength();
+ for ( ; addr <= end ; addr += 2048 ) {
+ __record_touch_dummy += addr[0];
+
+ break; // TODO: remove this, pending SERVER-3711
+
+ // note if this is a touch of a deletedrecord, we don't want to touch more than the first part. we may simply
+            // be updating the linked list and a deletedrecord could be gigantic. a similar, less extreme circumstance
+ // exists for any record if we are just updating its header, say on a remove(); some sort of hints might be
+ // useful.
+
+            if ( ! entireRecord )
+ break;
+ }
+ }
+ }
+
+ const bool blockSupported = ProcessInfo::blockCheckSupported();
+
+ bool Record::likelyInPhysicalMemory() {
+ if ( ! MemoryTrackingEnabled )
+ return true;
+
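+        // Decompose the record's address: 4KB pages ( >> 12 ), grouped into
+        // regions of 64 pages ( >> 6 ); 'offset' is the page's bit position in
+        // the region's 64-bit presence mask (see ps::Slice).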
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+
+ if ( ps::rolling.access( region , offset , false ) )
+ return true;
+
+ if ( ! blockSupported ) {
+            // this means we don't fall back to the system call
+ // and assume things aren't in memory
+ // possible we yield too much - but better than not yielding through a fault
+ return false;
+ }
+
+ return ProcessInfo::blockInMemory( data );
+ }
+
+
+ Record* Record::accessed() {
+ const size_t page = (size_t)data >> 12;
+ const size_t region = page >> 6;
+ const size_t offset = page & 0x3f;
+ ps::rolling.access( region , offset , true );
+ return this;
+ }
+
+ Record* DiskLoc::rec() const {
+ Record *r = DataFileMgr::getRecord(*this);
+#if defined(_PAGEFAULTEXCEPTION)
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is ON -- experimental at this time" << endl;
+ }
+ bool fault = !r->likelyInPhysicalMemory();
+ DEV if( rand() % 100 == 0 )
+ fault = true;
+ if( fault &&
+ !cc()._hasWrittenThisPass &&
+ cc()._pageFaultRetryableSection )
+ {
+ if( cc()._pageFaultRetryableSection->_laps > 100 ) {
+ log() << "info pagefaultexception _laps > 100" << endl;
+ }
+ else {
+ throw PageFaultException(r);
+ }
+ }
+#else
+ DEV ONCE {
+ log() << "_DEBUG info _PAGEFAULTEXCEPTION is off" << endl;
+ }
+#endif
+ return r;
+ }
+
+}
diff --git a/src/mongo/db/repl.cpp b/src/mongo/db/repl.cpp
new file mode 100644
index 00000000000..25ecb6b455f
--- /dev/null
+++ b/src/mongo/db/repl.cpp
@@ -0,0 +1,1516 @@
+// repl.cpp
+
+/* TODO
+ PAIRING
+ _ on a syncexception, don't allow going back to master state?
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Collections we use:
+
+ local.sources - indicates what sources we pull from as a "slave", and the last update of each
+ local.oplog.$main - our op log as "master"
+ local.dbinfo.<dbname> - no longer used???
+ local.pair.startup - [deprecated] can contain a special value indicating for a pair that we have the master copy.
+ used when replacing other half of the pair which has permanently failed.
+ local.pair.sync - [deprecated] { initialsynccomplete: 1 }
+*/
+
+#include "pch.h"
+#include "jsobj.h"
+#include "../util/goodies.h"
+#include "repl.h"
+#include "../util/net/message.h"
+#include "../util/background.h"
+#include "../client/dbclient.h"
+#include "../client/connpool.h"
+#include "pdfile.h"
+#include "ops/query.h"
+#include "db.h"
+#include "commands.h"
+#include "security.h"
+#include "cmdline.h"
+#include "repl_block.h"
+#include "repl/rs.h"
+#include "replutil.h"
+#include "repl/connections.h"
+#include "ops/update.h"
+
+namespace mongo {
+
+ // our config from command line etc.
+ ReplSettings replSettings;
+
+    /* if nonzero, sync() is running */
+ volatile int syncing = 0;
+ static volatile int relinquishSyncingSome = 0;
+
+ /* "dead" means something really bad happened like replication falling completely out of sync.
+ when non-null, we are dead and the string is informational
+ */
+ const char *replAllDead = 0;
+
+ time_t lastForcedResync = 0;
+
+} // namespace mongo
+
+namespace mongo {
+
+ /* output by the web console */
+ const char *replInfo = "";
+ struct ReplInfo {
+ ReplInfo(const char *msg) {
+ replInfo = msg;
+ }
+ ~ReplInfo() {
+ replInfo = "?";
+ }
+ };
+
+ /* operator requested resynchronization of replication (on the slave). { resync : 1 } */
+ class CmdResync : public Command {
+ public:
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual bool adminOnly() const {
+ return true;
+ }
+ virtual bool logTheOp() { return false; }
+ virtual LockType locktype() const { return WRITE; }
+ void help(stringstream&h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; }
+ CmdResync() : Command("resync") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( cmdLine.usingReplSets() ) {
+ errmsg = "resync command not currently supported with replica sets. See RS102 info in the mongodb documentations";
+ result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member");
+ return false;
+ }
+
+ if ( cmdObj.getBoolField( "force" ) ) {
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+ replAllDead = "resync forced";
+ }
+ if ( !replAllDead ) {
+ errmsg = "not dead, no need to resync";
+ return false;
+ }
+ if ( !waitForSyncToFinish( errmsg ) )
+ return false;
+
+ ReplSource::forceResyncDead( "client" );
+ result.append( "info", "triggered resync for all sources" );
+ return true;
+ }
+ bool waitForSyncToFinish( string &errmsg ) const {
+            // Wait for slave thread to finish syncing, so sources will be
+ // reloaded with new saved state on next pass.
+ Timer t;
+ while ( 1 ) {
+ if ( syncing == 0 || t.millis() > 30000 )
+ break;
+ {
+ dbtemprelease t;
+ relinquishSyncingSome = 1;
+ sleepmillis(1);
+ }
+ }
+ if ( syncing ) {
+ errmsg = "timeout waiting for sync() to finish";
+ return false;
+ }
+ return true;
+ }
+ } cmdResync;
+
+ bool anyReplEnabled() {
+ return replSettings.slave || replSettings.master || theReplSet;
+ }
+
+ bool replAuthenticate(DBClientBase *conn);
+
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ) {
+
+ if ( replSet ) {
+ if( theReplSet == 0 ) {
+ result.append("ismaster", false);
+ result.append("secondary", false);
+ result.append("info", ReplSet::startupStatusMsg.get());
+ result.append( "isreplicaset" , true );
+ return;
+ }
+
+ theReplSet->fillIsMaster(result);
+ return;
+ }
+
+ if ( replAllDead ) {
+ result.append("ismaster", 0);
+ string s = string("dead: ") + replAllDead;
+ result.append("info", s);
+ }
+ else {
+ result.appendBool("ismaster", _isMaster() );
+ }
+
+ if ( level && replSet ) {
+ result.append( "info" , "is replica set" );
+ }
+ else if ( level ) {
+ BSONObjBuilder sources( result.subarrayStart( "sources" ) );
+
+ readlock lk( "local.sources" );
+ Client::Context ctx( "local.sources", dbpath, authed );
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ BSONObj s = c->current();
+
+ BSONObjBuilder bb;
+ bb.append( s["host"] );
+ string sourcename = s["source"].valuestr();
+ if ( sourcename != "main" )
+ bb.append( s["source"] );
+
+ {
+ BSONElement e = s["syncedTo"];
+ BSONObjBuilder t( bb.subobjStart( "syncedTo" ) );
+ t.appendDate( "time" , e.timestampTime() );
+ t.append( "inc" , e.timestampInc() );
+ t.done();
+ }
+
+ if ( level > 1 ) {
+ dbtemprelease unlock;
+                    // note: there is no socket-style (SO_*) timeout on this connection; perhaps we should have one.
+ ScopedDbConnection conn( s["host"].valuestr() );
+ DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() );
+ if ( cliConn && replAuthenticate( cliConn ) ) {
+ BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) );
+ BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) );
+ bb.appendDate( "masterFirst" , first["ts"].timestampTime() );
+ bb.appendDate( "masterLast" , last["ts"].timestampTime() );
+ double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
+ bb.append( "lagSeconds" , lag / 1000 );
+ }
+ conn.done();
+ }
+
+ sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() );
+ c->advance();
+ }
+
+ sources.done();
+ }
+ }
+
+ class CmdIsMaster : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual void help( stringstream &help ) const {
+ help << "Check if this server is primary for a replica pair/set; also if it is --master or --slave in simple master/slave setups.\n";
+ help << "{ isMaster : 1 }";
+ }
+ virtual LockType locktype() const { return NONE; }
+ CmdIsMaster() : Command("isMaster", true, "ismaster") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
+            /* currently a request to an arbiter is (somewhat arbitrarily) an ismaster request that is not
+               authenticated.
+               we allow unauthenticated ismaster, but to be safe we aren't as verbose informationally if
+               one is not authenticated for the admin db.
+ */
+ bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin");
+ appendReplicationInfo( result , authed );
+
+ result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize);
+ return true;
+ }
+ } cmdismaster;
+
+ ReplSource::ReplSource() {
+ nClonedThisPass = 0;
+ }
+
+ ReplSource::ReplSource(BSONObj o) : nClonedThisPass(0) {
+ only = o.getStringField("only");
+ hostName = o.getStringField("host");
+ _sourceName = o.getStringField("source");
+ uassert( 10118 , "'host' field not set in sources collection object", !hostName.empty() );
+ uassert( 10119 , "only source='main' allowed for now with replication", sourceName() == "main" );
+ BSONElement e = o.getField("syncedTo");
+ if ( !e.eoo() ) {
+ uassert( 10120 , "bad sources 'syncedTo' field value", e.type() == Date || e.type() == Timestamp );
+ OpTime tmp( e.date() );
+ syncedTo = tmp;
+ }
+
+ BSONObj dbsObj = o.getObjectField("dbsNextPass");
+ if ( !dbsObj.isEmpty() ) {
+ BSONObjIterator i(dbsObj);
+ while ( 1 ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ addDbNextPass.insert( e.fieldName() );
+ }
+ }
+
+ dbsObj = o.getObjectField("incompleteCloneDbs");
+ if ( !dbsObj.isEmpty() ) {
+ BSONObjIterator i(dbsObj);
+ while ( 1 ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ incompleteCloneDbs.insert( e.fieldName() );
+ }
+ }
+ }
+
+ /* Turn our C++ Source object into a BSONObj */
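+    /* e.g. (illustrative): { host: "master:27017", source: "main",
+       syncedTo: <timestamp>, dbsNextPass: { somedb: true } } */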
+ BSONObj ReplSource::jsobj() {
+ BSONObjBuilder b;
+ b.append("host", hostName);
+ b.append("source", sourceName());
+ if ( !only.empty() )
+ b.append("only", only);
+ if ( !syncedTo.isNull() )
+ b.appendTimestamp("syncedTo", syncedTo.asDate());
+
+ BSONObjBuilder dbsNextPassBuilder;
+ int n = 0;
+ for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) {
+ n++;
+ dbsNextPassBuilder.appendBool(*i, 1);
+ }
+ if ( n )
+ b.append("dbsNextPass", dbsNextPassBuilder.done());
+
+ BSONObjBuilder incompleteCloneDbsBuilder;
+ n = 0;
+ for ( set<string>::iterator i = incompleteCloneDbs.begin(); i != incompleteCloneDbs.end(); i++ ) {
+ n++;
+ incompleteCloneDbsBuilder.appendBool(*i, 1);
+ }
+ if ( n )
+ b.append("incompleteCloneDbs", incompleteCloneDbsBuilder.done());
+
+ return b.obj();
+ }
+
+ void ReplSource::save() {
+ BSONObjBuilder b;
+ assert( !hostName.empty() );
+ b.append("host", hostName);
+ // todo: finish allowing multiple source configs.
+ // this line doesn't work right when source is null, if that is allowed as it is now:
+ //b.append("source", _sourceName);
+ BSONObj pattern = b.done();
+
+ BSONObj o = jsobj();
+ log( 1 ) << "Saving repl source: " << o << endl;
+
+ {
+ OpDebug debug;
+ Client::Context ctx("local.sources");
+ UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug);
+ assert( ! res.mod );
+ assert( res.num == 1 );
+ }
+ }
+
+ static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, ReplSource::SourceVector &old) {
+ if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync.
+ for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) {
+ if ( s == **i ) {
+ v.push_back(*i);
+ old.erase(i);
+ return;
+ }
+ i++;
+ }
+ }
+
+ v.push_back( shared_ptr< ReplSource >( new ReplSource( s ) ) );
+ }
+
+ /* we reuse our existing objects so that we can keep our existing connection
+ and cursor in effect.
+ */
+ void ReplSource::loadAll(SourceVector &v) {
+ Client::Context ctx("local.sources");
+ SourceVector old = v;
+ v.clear();
+
+ if ( !cmdLine.source.empty() ) {
+ // --source <host> specified.
+ // check that no items are in sources other than that
+ // add if missing
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ int n = 0;
+ while ( c->ok() ) {
+ n++;
+ ReplSource tmp(c->current());
+ if ( tmp.hostName != cmdLine.source ) {
+ log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl;
+ log() << "repl: for instructions on changing this slave's source, see:" << endl;
+ log() << "http://dochub.mongodb.org/core/masterslave" << endl;
+ log() << "repl: terminating mongod after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ if ( tmp.only != cmdLine.only ) {
+ log() << "--only " << cmdLine.only << " != " << tmp.only << " from local.sources collection" << endl;
+ log() << "terminating after 30 seconds" << endl;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ }
+ c->advance();
+ }
+ uassert( 10002 , "local.sources collection corrupt?", n<2 );
+ if ( n == 0 ) {
+ // source missing. add.
+ ReplSource s;
+ s.hostName = cmdLine.source;
+ s.only = cmdLine.only;
+ s.save();
+ }
+ }
+ else {
+ try {
+ massert( 10384 , "--only requires use of --source", cmdLine.only.empty());
+ }
+ catch ( ... ) {
+ dbexit( EXIT_BADOPTIONS );
+ }
+ }
+
+ shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
+ while ( c->ok() ) {
+ ReplSource tmp(c->current());
+ if ( tmp.syncedTo.isNull() ) {
+                DBDirectClient directClient;
+                if ( directClient.exists( "local.oplog.$main" ) ) {
+                    BSONObj op = directClient.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) );
+ if ( !op.isEmpty() ) {
+ tmp.syncedTo = op[ "ts" ].date();
+ }
+ }
+ }
+ addSourceToList(v, tmp, old);
+ c->advance();
+ }
+ }
+
+ BSONObj opTimeQuery = fromjson("{\"getoptime\":1}");
+
+ bool ReplSource::throttledForceResyncDead( const char *requester ) {
+ if ( time( 0 ) - lastForcedResync > 600 ) {
+ forceResyncDead( requester );
+ lastForcedResync = time( 0 );
+ return true;
+ }
+ return false;
+ }
+
+ void ReplSource::forceResyncDead( const char *requester ) {
+ if ( !replAllDead )
+ return;
+ SourceVector sources;
+ ReplSource::loadAll(sources);
+ for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) {
+ log() << requester << " forcing resync from " << (*i)->hostName << endl;
+ (*i)->forceResync( requester );
+ }
+ replAllDead = 0;
+ }
+
+ void ReplSource::forceResync( const char *requester ) {
+ BSONObj info;
+ {
+ dbtemprelease t;
+ if (!oplogReader.connect(hostName)) {
+ msgassertedNoTrace( 14051 , "unable to connect to resync");
+ }
+ /* todo use getDatabaseNames() method here */
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 10385 , "Unable to get database list", ok );
+ }
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ string name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( !e.embeddedObject().getBoolField( "empty" ) ) {
+ if ( name != "local" ) {
+ if ( only.empty() || only == name ) {
+ resyncDrop( name.c_str(), requester );
+ }
+ }
+ }
+ }
+ syncedTo = OpTime();
+ addDbNextPass.clear();
+ save();
+ }
+
+ string ReplSource::resyncDrop( const char *db, const char *requester ) {
+ log() << "resync: dropping database " << db << endl;
+ Client::Context ctx(db);
+ dropDatabase(db);
+ return db;
+ }
+
+ /* grab initial copy of a database from the master */
+ void ReplSource::resync(string db) {
+ string dummyNs = resyncDrop( db.c_str(), "internal" );
+ Client::Context ctx( dummyNs );
+ {
+ log() << "resync: cloning database " << db << " to get an initial copy" << endl;
+ ReplInfo r("resync: cloning a database");
+ string errmsg;
+ int errCode = 0;
+ bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveOk*/ true, /*replauth*/ true, /*snapshot*/false, /*mayYield*/true, /*mayBeInterrupted*/false, &errCode);
+ if ( !ok ) {
+ if ( errCode == DatabaseDifferCaseCode ) {
+ resyncDrop( db.c_str(), "internal" );
+ log() << "resync: database " << db << " not valid on the master due to a name conflict, dropping." << endl;
+ return;
+ }
+ else {
+ problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl;
+ throw SyncException();
+ }
+ }
+ }
+
+ log() << "resync: done with initial clone for db: " << db << endl;
+
+ return;
+ }
+
+ DatabaseIgnorer ___databaseIgnorer;
+
+ void DatabaseIgnorer::doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ) {
+ if ( futureOplogTime > _ignores[ db ] ) {
+ _ignores[ db ] = futureOplogTime;
+ }
+ }
+
+ bool DatabaseIgnorer::ignoreAt( const string &db, const OpTime &currentOplogTime ) {
+ if ( _ignores[ db ].isNull() ) {
+ return false;
+ }
+ if ( _ignores[ db ] >= currentOplogTime ) {
+ return true;
+ } else {
+ // The ignore state has expired, so clear it.
+ _ignores.erase( db );
+ return false;
+ }
+ }
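+
+    // Illustrative behavior: after doIgnoreUntilAfter( "foo", t ), ignoreAt( "foo", u )
+    // returns true for any u <= t; the first call with u > t clears the ignore
+    // state and returns false.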
+
+ bool ReplSource::handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db ) {
+ if ( dbHolder()._isLoaded( ns, dbpath ) ) {
+ // Database is already present.
+ return true;
+ }
+ BSONElement ts = op.getField( "ts" );
+ if ( ( ts.type() == Date || ts.type() == Timestamp ) && ___databaseIgnorer.ignoreAt( db, ts.date() ) ) {
+ // Database is ignored due to a previous indication that it is
+ // missing from master after optime "ts".
+ return false;
+ }
+ if ( Database::duplicateUncasedName( false, db, dbpath ).empty() ) {
+ // No duplicate database names are present.
+ return true;
+ }
+
+ OpTime lastTime;
+ bool dbOk = false;
+ {
+ dbtemprelease release;
+
+ // We always log an operation after executing it (never before), so
+ // a database list will always be valid as of an oplog entry generated
+ // before it was retrieved.
+
+ BSONObj last = oplogReader.findOne( this->ns().c_str(), Query().sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.getField( "ts" );
+ massert( 14032, "Invalid 'ts' in remote log", ts.type() == Date || ts.type() == Timestamp );
+ lastTime = OpTime( ts.date() );
+ }
+
+ BSONObj info;
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 14033, "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.more() ) {
+ BSONElement e = i.next();
+
+ const char * name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( strcasecmp( name, db ) != 0 )
+ continue;
+
+ if ( strcmp( name, db ) == 0 ) {
+ // The db exists on master, still need to check that no conflicts exist there.
+ dbOk = true;
+ continue;
+ }
+
+ // The master has a db name that conflicts with the requested name.
+ dbOk = false;
+ break;
+ }
+ }
+
+ if ( !dbOk ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( db, lastTime );
+ incompleteCloneDbs.erase(db);
+ addDbNextPass.erase(db);
+ return false;
+ }
+
+ // Check for duplicates again, since we released the lock above.
+ set< string > duplicates;
+ Database::duplicateUncasedName( false, db, dbpath, &duplicates );
+
+ // The database is present on the master and no conflicting databases
+ // are present on the master. Drop any local conflicts.
+ for( set< string >::const_iterator i = duplicates.begin(); i != duplicates.end(); ++i ) {
+ ___databaseIgnorer.doIgnoreUntilAfter( *i, lastTime );
+ incompleteCloneDbs.erase(*i);
+ addDbNextPass.erase(*i);
+ Client::Context ctx(*i);
+ dropDatabase(*i);
+ }
+
+ massert( 14034, "Duplicate database names present after attempting to delete duplicates",
+ Database::duplicateUncasedName( false, db, dbpath ).empty() );
+ return true;
+ }
+
+ void ReplSource::applyOperation(const BSONObj& op) {
+ try {
+ bool failedUpdate = applyOperation_inlock( op );
+ if (failedUpdate) {
+ Sync sync(hostName);
+ if (sync.shouldRetry(op)) {
+ uassert(15914, "Failure retrying initial sync update", !applyOperation_inlock(op));
+ }
+ }
+ }
+ catch ( UserException& e ) {
+ log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;;
+ }
+ catch ( DBException& e ) {
+ log() << "sync: caught db exception " << e << " while applying op: " << op << endl;;
+ }
+
+ }
+
+    /* local.oplog.$main is of the form:
+ { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> }
+ ...
+ see logOp() comments.
+
+ @param alreadyLocked caller already put us in write lock if true
+ */
+ void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked) {
+        if( logLevel >= 6 ) // op.toString() is expensive so doing this check explicitly
+ log(6) << "processing op: " << op << endl;
+
+ if( op.getStringField("op")[0] == 'n' )
+ return;
+
+ char clientName[MaxDatabaseNameLen];
+ const char *ns = op.getStringField("ns");
+ nsToDatabase(ns, clientName);
+
+ if ( *ns == '.' ) {
+ problem() << "skipping bad op in oplog: " << op.toString() << endl;
+ return;
+ }
+ else if ( *ns == 0 ) {
+ /*if( op.getStringField("op")[0] != 'n' )*/ {
+ problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl;
+ replAllDead = "bad object in oplog";
+ throw SyncException();
+ }
+ //ns = "local.system.x";
+ //nsToDatabase(ns, clientName);
+ }
+
+ if ( !only.empty() && only != clientName )
+ return;
+
+ if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) {
+ if( cmdLine.pretouch > 1 ) {
+ /* note: this is bad - should be put in ReplSource. but this is first test... */
+ static int countdown;
+ assert( countdown >= 0 );
+ if( countdown > 0 ) {
+ countdown--; // was pretouched on a prev pass
+ }
+ else {
+ const int m = 4;
+ if( tp.get() == 0 ) {
+ int nthr = min(8, cmdLine.pretouch);
+ nthr = max(nthr, 1);
+ tp.reset( new ThreadPool(nthr) );
+ }
+ vector<BSONObj> v;
+ oplogReader.peek(v, cmdLine.pretouch);
+ unsigned a = 0;
+ while( 1 ) {
+ if( a >= v.size() ) break;
+ unsigned b = a + m - 1; // v[a..b]
+ if( b >= v.size() ) b = v.size() - 1;
+ tp->schedule(pretouchN, v, a, b);
+ DEV cout << "pretouch task: " << a << ".." << b << endl;
+ a += m;
+ }
+ // we do one too...
+ pretouchOperation(op);
+ tp->join();
+ countdown = v.size();
+ }
+ }
+ else {
+ pretouchOperation(op);
+ }
+ }
+
+ scoped_ptr<writelock> lk( alreadyLocked ? 0 : new writelock() );
+
+ if ( replAllDead ) {
+ // hmmm why is this check here and not at top of this function? does it get set between top and here?
+ log() << "replAllDead, throwing SyncException: " << replAllDead << endl;
+ throw SyncException();
+ }
+
+ if ( !handleDuplicateDbName( op, ns, clientName ) ) {
+ return;
+ }
+
+ Client::Context ctx( ns );
+ ctx.getClient()->curop()->reset();
+
+ bool empty = ctx.db()->isEmpty();
+ bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0;
+
+ if( logLevel >= 6 )
+ log(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl;
+
+        // always apply admin commands
+ // this is a bit hacky -- the semantics of replication/commands aren't well specified
+ if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) {
+ applyOperation( op );
+ return;
+ }
+
+ if ( ctx.justCreated() || empty || incompleteClone ) {
+ // we must add to incomplete list now that setClient has been called
+ incompleteCloneDbs.insert( clientName );
+ if ( nClonedThisPass ) {
+                /* we only clone one database per pass, even if a lot need to be done.  This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ addDbNextPass.insert( clientName );
+ }
+ else {
+ if ( incompleteClone ) {
+ log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl;
+ }
+ save();
+ Client::Context ctx(ns);
+ nClonedThisPass++;
+ resync(ctx.db()->name);
+ addDbNextPass.erase(clientName);
+ incompleteCloneDbs.erase( clientName );
+ }
+ save();
+ }
+ else {
+ applyOperation( op );
+ addDbNextPass.erase( clientName );
+ }
+ }
+
+ void ReplSource::syncToTailOfRemoteLog() {
+ string _ns = ns();
+ BSONObjBuilder b;
+ if ( !only.empty() ) {
+ b.appendRegex("ns", string("^") + only);
+ }
+ BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) );
+ if ( !last.isEmpty() ) {
+ BSONElement ts = last.getField( "ts" );
+ massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp );
+ syncedTo = OpTime( ts.date() );
+ }
+ }
+
+ extern unsigned replApplyBatchSize;
+
+ /* slave: pull some data from the master's oplog
+ note: not yet in db mutex at this point.
+ @return -1 error
+ 0 ok, don't sleep
+ 1 ok, sleep
+ */
+ int ReplSource::sync_pullOpLog(int& nApplied) {
+ int okResultCode = 1;
+ string ns = string("local.oplog.$") + sourceName();
+ log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n';
+
+ bool tailing = true;
+ oplogReader.tailCheck();
+
+ bool initial = syncedTo.isNull();
+
+ if ( !oplogReader.haveCursor() || initial ) {
+ if ( initial ) {
+ // Important to grab last oplog timestamp before listing databases.
+ syncToTailOfRemoteLog();
+ BSONObj info;
+ bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+ massert( 10389 , "Unable to get database list", ok );
+ BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ string name = e.embeddedObject().getField( "name" ).valuestr();
+ if ( !e.embeddedObject().getBoolField( "empty" ) ) {
+ if ( name != "local" ) {
+ if ( only.empty() || only == name ) {
+ log( 2 ) << "adding to 'addDbNextPass': " << name << endl;
+ addDbNextPass.insert( name );
+ }
+ }
+ }
+ }
+ dblock lk;
+ save();
+ }
+
+ BSONObjBuilder q;
+ q.appendDate("$gte", syncedTo.asDate());
+ BSONObjBuilder query;
+ query.append("ts", q.done());
+ if ( !only.empty() ) {
+                // note we may skip a LOT of data table scanning here, saving the master a lot of work.
+ query.appendRegex("ns", string("^") + only); // maybe append "\\." here?
+ }
+ BSONObj queryObj = query.done();
+ // e.g. queryObj = { ts: { $gte: syncedTo } }
+
+ oplogReader.tailingQuery(ns.c_str(), queryObj);
+ tailing = false;
+ }
+ else {
+ log(2) << "repl: tailing=true\n";
+ }
+
+ if( !oplogReader.haveCursor() ) {
+ problem() << "repl: dbclient::query returns null (conn closed?)" << endl;
+ oplogReader.resetConnection();
+ return -1;
+ }
+
+ // show any deferred database creates from a previous pass
+ {
+ set<string>::iterator i = addDbNextPass.begin();
+ if ( i != addDbNextPass.end() ) {
+ BSONObjBuilder b;
+ b.append("ns", *i + '.');
+ b.append("op", "db");
+ BSONObj op = b.done();
+ sync_pullOpLog_applyOperation(op, false);
+ }
+ }
+
+ if ( !oplogReader.more() ) {
+ if ( tailing ) {
+ log(2) << "repl: tailing & no new activity\n";
+ if( oplogReader.awaitCapable() )
+ okResultCode = 0; // don't sleep
+
+ }
+ else {
+ log() << "repl: " << ns << " oplog is empty\n";
+ }
+ {
+ dblock lk;
+ save();
+ }
+ return okResultCode;
+ }
+
+ OpTime nextOpTime;
+ {
+ BSONObj op = oplogReader.next();
+ BSONElement ts = op.getField("ts");
+ if ( ts.type() != Date && ts.type() != Timestamp ) {
+ string err = op.getStringField("$err");
+ if ( !err.empty() ) {
+ // 13051 is "tailable cursor requested on non capped collection"
+ if (op.getIntField("code") == 13051) {
+ problem() << "trying to slave off of a non-master" << '\n';
+ massert( 13344 , "trying to slave off of a non-master", false );
+ }
+ else {
+ problem() << "repl: $err reading remote oplog: " + err << '\n';
+ massert( 10390 , "got $err reading remote oplog", false );
+ }
+ }
+ else {
+ problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n';
+ massert( 10391 , "repl: bad object read from remote oplog", false);
+ }
+ }
+
+ nextOpTime = OpTime( ts.date() );
+ log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
+ if ( initial ) {
+ log(1) << "repl: initial run\n";
+ }
+ if( tailing ) {
+ if( !( syncedTo < nextOpTime ) ) {
+ log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl;
+ log() << "repl syncTo: " << syncedTo.toStringLong() << endl;
+ log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl;
+ assert(false);
+ }
+ oplogReader.putBack( op ); // op will be processed in the loop below
+ nextOpTime = OpTime(); // will reread the op below
+ }
+ else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error
+ Nullstream& l = log();
+ l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' ';
+ if ( nextOpTime < syncedTo )
+ l << "<??";
+ else
+ l << ">";
+
+ l << " syncedTo " << syncedTo.toStringLong() << '\n';
+ log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n";
+ log() << "repl: tailing: " << tailing << '\n';
+ log() << "repl: data too stale, halting replication" << endl;
+ replInfo = replAllDead = "data too stale halted replication";
+ assert( syncedTo < nextOpTime );
+ throw SyncException();
+ }
+ else {
+ /* t == syncedTo, so the first op was applied previously or it is the first op of the initial query and need not be applied. */
+ }
+ }
+
+ // apply operations
+ {
+ int n = 0;
+ time_t saveLast = time(0);
+ while ( 1 ) {
+
+ bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to ensure we actually process at least one op to get a sync point recorded in the first place.
+
+ if ( moreInitialSyncsPending || !oplogReader.more() ) {
+ dblock lk;
+
+ // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior.
+ {
+ dbtemprelease t;
+ if ( !moreInitialSyncsPending && oplogReader.more() ) {
+ continue;
+ }
+ // otherwise, break out of loop so we can set to completed or clone more dbs
+ }
+
+ if( oplogReader.awaitCapable() && tailing )
+ okResultCode = 0; // don't sleep
+ syncedTo = nextOpTime;
+ save(); // note how far we are synced up to now
+ log() << "repl: applied " << n << " operations" << endl;
+ nApplied = n;
+ log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl;
+ break;
+ }
+
+ OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) {
+ // periodically note our progress, in case we are doing a lot of work and crash
+ dblock lk;
+ syncedTo = nextOpTime;
+ // can't update local log ts since there are pending operations from our peer
+ save();
+ log() << "repl: checkpoint applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ saveLast = time(0);
+ n = 0;
+ }
+
+ BSONObj op = oplogReader.next();
+
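+ /* when replApplyBatchSize > 1 we take the write lock once here and apply up to that
+ many ops inside it; with a batch size of 1 no lock is taken at this level and
+ sync_pullOpLog_applyOperation acquires it per operation. */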
+ unsigned b = replApplyBatchSize;
+ bool justOne = b == 1;
+ scoped_ptr<writelock> lk( justOne ? 0 : new writelock() );
+ while( 1 ) {
+
+ BSONElement ts = op.getField("ts");
+ if( !( ts.type() == Date || ts.type() == Timestamp ) ) {
+ log() << "sync error: problem querying remote oplog record" << endl;
+ log() << "op: " << op.toString() << endl;
+ log() << "halting replication" << endl;
+ replInfo = replAllDead = "sync error: no ts found querying remote oplog record";
+ throw SyncException();
+ }
+ OpTime last = nextOpTime;
+ nextOpTime = OpTime( ts.date() );
+ if ( !( last < nextOpTime ) ) {
+ log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl;
+ log() << " last: " << last.toStringLong() << endl;
+ log() << " nextOpTime: " << nextOpTime.toStringLong() << endl;
+ log() << " halting replication" << endl;
+ replInfo = replAllDead = "sync error last >= nextOpTime";
+ uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false);
+ }
+ if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
+ assert( justOne );
+ oplogReader.putBack( op );
+ _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
+ dblock lk;
+ if ( n > 0 ) {
+ syncedTo = last;
+ save();
+ }
+ log() << "repl: applied " << n << " operations" << endl;
+ log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
+ log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl;
+ return okResultCode;
+ }
+
+ sync_pullOpLog_applyOperation(op, !justOne);
+ n++;
+
+ if( --b == 0 )
+ break;
+ // if we get here, we are applying multiple operations in a single write lock acquisition
+ if( !oplogReader.moreInCurrentBatch() ) {
+ // break if no more in batch so we release lock while reading from the master
+ break;
+ }
+ op = oplogReader.next();
+
+ getDur().commitIfNeeded();
+ }
+ }
+ }
+
+ return okResultCode;
+ }
+
+ BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}");
+
+ bool replAuthenticate(DBClientBase *conn) {
+ if( noauth ) {
+ return true;
+ }
+ if( ! cc().isAdmin() ) {
+ log() << "replauthenticate: requires admin permissions, failing\n";
+ return false;
+ }
+
+ string u;
+ string p;
+ if (internalSecurity.pwd.length() > 0) {
+ u = internalSecurity.user;
+ p = internalSecurity.pwd;
+ }
+ else {
+ BSONObj user;
+ {
+ dblock lk;
+ Client::Context ctxt("local.");
+ if( !Helpers::findOne("local.system.users", userReplQuery, user) ||
+ // try the first user in local
+ !Helpers::getSingleton("local.system.users", user) ) {
+ log() << "replauthenticate: no user in local.system.users to use for authentication\n";
+ return false;
+ }
+ }
+ u = user.getStringField("user");
+ p = user.getStringField("pwd");
+ massert( 10392 , "bad user object? [1]", !u.empty());
+ massert( 10393 , "bad user object? [2]", !p.empty());
+ }
+
+ string err;
+ if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) {
+ log() << "replauthenticate: can't authenticate to master server, user:" << u << endl;
+ return false;
+ }
+ return true;
+ }
+
+ bool replHandshake(DBClientConnection *conn) {
+
+ string myname = getHostName();
+
+ BSONObj me;
+ {
+
+ dblock l;
+ // local.me is an identifier for a server for getLastError w:2+
+ if ( ! Helpers::getSingleton( "local.me" , me ) ||
+ ! me.hasField("host") ||
+ me["host"].String() != myname ) {
+
+ // clean out local.me
+ Helpers::emptyCollection("local.me");
+
+ // repopulate
+ BSONObjBuilder b;
+ b.appendOID( "_id" , 0 , true );
+ b.append( "host", myname );
+ me = b.obj();
+ Helpers::putSingleton( "local.me" , me );
+ }
+ }
+
+ BSONObjBuilder cmd;
+ cmd.appendAs( me["_id"] , "handshake" );
+ if (theReplSet) {
+ cmd.append("member", theReplSet->selfId());
+ }
+
+ BSONObj res;
+ bool ok = conn->runCommand( "admin" , cmd.obj() , res );
+ // failures are ignored on purpose for now, for compatibility with older versions
+ log(ok) << "replHandshake result: " << ok << " res: " << res << endl;
+ return true;
+ }
+
+ bool OplogReader::commonConnect(const string& hostName) {
+ if( conn() == 0 ) {
+ _conn = shared_ptr<DBClientConnection>(new DBClientConnection( false, 0, 0 /* tcp timeout */));
+ string errmsg;
+ ReplInfo r("trying to connect to sync source");
+ if ( !_conn->connect(hostName.c_str(), errmsg) ||
+ (!noauth && !replAuthenticate(_conn.get())) ) {
+ resetConnection();
+ log() << "repl: " << errmsg << endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool OplogReader::connect(string hostName) {
+ if (conn() != 0) {
+ return true;
+ }
+
+ if (commonConnect(hostName)) {
+ return replHandshake(_conn.get());
+ }
+ return false;
+ }
+
+ bool OplogReader::connect(const BSONObj& rid, const int from, const string& to) {
+ if (conn() != 0) {
+ return true;
+ }
+ if (commonConnect(to)) {
+ log() << "handshake between " << from << " and " << to << endl;
+ return passthroughHandshake(rid, from);
+ }
+ return false;
+ }
+
+ bool OplogReader::passthroughHandshake(const BSONObj& rid, const int f) {
+ BSONObjBuilder cmd;
+ cmd.appendAs( rid["_id"], "handshake" );
+ cmd.append( "member" , f );
+
+ BSONObj res;
+ return conn()->runCommand( "admin" , cmd.obj() , res );
+ }
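+
+ /* illustrative only: the command built above has the form
+ { handshake: ObjectId("..."), member: <f> }
+ i.e. the same shape replHandshake sends using this server's own local.me _id. */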
+
+ /* note: not yet in mutex at this point.
+ returns >= 0 if ok. returns -1 if the caller should reconnect.
+ return value of zero indicates no sleep necessary before next call
+ */
+ int ReplSource::sync(int& nApplied) {
+ _sleepAdviceTime = 0;
+ ReplInfo r("sync");
+ if ( !cmdLine.quiet ) {
+ Nullstream& l = log();
+ l << "repl: syncing from ";
+ if( sourceName() != "main" ) {
+ l << "source:" << sourceName() << ' ';
+ }
+ l << "host:" << hostName << endl;
+ }
+ nClonedThisPass = 0;
+
+ // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName.
+ if ( (string("localhost") == hostName || string("127.0.0.1") == hostName) && cmdLine.port == CmdLine::DefaultDBPort ) {
+ log() << "repl: can't sync from self (localhost). sources configuration may be wrong." << endl;
+ sleepsecs(5);
+ return -1;
+ }
+
+ if ( !oplogReader.connect(hostName) ) {
+ log(4) << "repl: can't connect to sync source" << endl;
+ return -1;
+ }
+
+ /*
+ // get current mtime at the server.
+ BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
+ BSONElement e = o.getField("optime");
+ if( e.eoo() ) {
+ log() << "repl: failed to get cur optime from master" << endl;
+ log() << " " << o.toString() << endl;
+ return false;
+ }
+ uassert( 10124 , e.type() == Date );
+ OpTime serverCurTime;
+ serverCurTime.asDate() = e.date();
+ */
+ return sync_pullOpLog(nApplied);
+ }
+
+ /* --------------------------------------------------------------*/
+
+ /*
+ TODO:
+ _ source has autoptr to the cursor
+ _ reuse that cursor when we can
+ */
+
+ /* returns: # of seconds to sleep before next pass
+ 0 = no sleep recommended
+ 1 = special sentinel indicating adaptive sleep recommended
+ */
+ int _replMain(ReplSource::SourceVector& sources, int& nApplied) {
+ {
+ ReplInfo r("replMain load sources");
+ dblock lk;
+ ReplSource::loadAll(sources);
+ replSettings.fastsync = false; // only need this param for initial reset
+ }
+
+ if ( sources.empty() ) {
+ /* replication is not configured yet (for --slave) in local.sources. Poll for it
+ every 20 seconds.
+ */
+ log() << "no source given, add a master to local.sources to start replication" << endl;
+ return 20;
+ }
+
+ int sleepAdvice = 1;
+ for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++ ) {
+ ReplSource *s = i->get();
+ int res = -1;
+ try {
+ res = s->sync(nApplied);
+ bool moreToSync = s->haveMoreDbsToSync();
+ if( res < 0 ) {
+ sleepAdvice = 3;
+ }
+ else if( moreToSync ) {
+ sleepAdvice = 0;
+ }
+ else if ( s->sleepAdvice() ) {
+ sleepAdvice = s->sleepAdvice();
+ }
+ else
+ sleepAdvice = res;
+ }
+ catch ( const SyncException& ) {
+ log() << "caught SyncException" << endl;
+ return 10;
+ }
+ catch ( AssertionException& e ) {
+ if ( e.severe() ) {
+ log() << "replMain AssertionException " << e.what() << endl;
+ return 60;
+ }
+ else {
+ log() << "repl: AssertionException " << e.what() << '\n';
+ }
+ replInfo = "replMain caught AssertionException";
+ }
+ catch ( const DBException& e ) {
+ log() << "repl: DBException " << e.what() << endl;
+ replInfo = "replMain caught DBException";
+ }
+ catch ( const std::exception &e ) {
+ log() << "repl: std::exception " << e.what() << endl;
+ replInfo = "replMain caught std::exception";
+ }
+ catch ( ... ) {
+ log() << "unexpected exception during replication. replication will halt" << endl;
+ replAllDead = "caught unexpected exception during replication";
+ }
+ if ( res < 0 )
+ s->oplogReader.resetConnection();
+ }
+ return sleepAdvice;
+ }
+
+ void replMain() {
+ ReplSource::SourceVector sources;
+ while ( 1 ) {
+ int s = 0;
+ {
+ dblock lk;
+ if ( replAllDead ) {
+ // throttledForceResyncDead can throw
+ if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) {
+ log() << "all sources dead: " << replAllDead << ", sleeping for 5 seconds" << endl;
+ break;
+ }
+ }
+ assert( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this.
+ syncing++;
+ }
+ try {
+ int nApplied = 0;
+ s = _replMain(sources, nApplied);
+ if( s == 1 ) {
+ if( nApplied == 0 ) s = 2;
+ else if( nApplied > 100 ) {
+ // sleep very little - just enough that we aren't truly hammering the master
+ sleepmillis(75);
+ s = 0;
+ }
+ }
+ }
+ catch (...) {
+ out() << "caught exception in _replMain" << endl;
+ s = 4;
+ }
+ {
+ dblock lk;
+ assert( syncing == 1 );
+ syncing--;
+ }
+
+ if( relinquishSyncingSome ) {
+ relinquishSyncingSome = 0;
+ s = 1; // sleep before going back in to syncing=1
+ }
+
+ if ( s ) {
+ stringstream ss;
+ ss << "repl: sleep " << s << " sec before next pass";
+ string msg = ss.str();
+ if ( ! cmdLine.quiet )
+ log() << msg << endl;
+ ReplInfo r(msg.c_str());
+ sleepsecs(s);
+ }
+ }
+ }
+
+ static void replMasterThread() {
+ sleepsecs(4);
+ Client::initThread("replmaster");
+ int toSleep = 10;
+ while( 1 ) {
+
+ sleepsecs( toSleep );
+ /* write a keep-alive like entry to the log. this will make things like
+ printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date
+ even when things are idle.
+ */
+ {
+ writelocktry lk("",1);
+ if ( lk.got() ) {
+ toSleep = 10;
+
+ replLocalAuth();
+
+ try {
+ logKeepalive();
+ }
+ catch(...) {
+ log() << "caught exception in replMasterThread()" << endl;
+ }
+ }
+ else {
+ log(5) << "couldn't logKeepalive" << endl;
+ toSleep = 1;
+ }
+ }
+ }
+ }
+
+ void replSlaveThread() {
+ sleepsecs(1);
+ Client::initThread("replslave");
+ cc().iAmSyncThread();
+
+ {
+ dblock lk;
+ replLocalAuth();
+ }
+
+ while ( 1 ) {
+ try {
+ replMain();
+ sleepsecs(5);
+ }
+ catch ( AssertionException& ) {
+ ReplInfo r("Assertion in replSlaveThread(): sleeping 5 minutes before retry");
+ problem() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ catch ( DBException& e ) {
+ problem() << "exception in replSlaveThread(): " << e.what()
+ << ", sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ catch ( ... ) {
+ problem() << "error in replSlaveThread(): sleeping 5 minutes before retry" << endl;
+ sleepsecs(300);
+ }
+ }
+ }
+
+ void tempThread() {
+ while ( 1 ) {
+ out() << d.dbMutex.info().isLocked() << endl;
+ sleepmillis(100);
+ }
+ }
+
+ void newRepl();
+ void oldRepl();
+ void startReplSets(ReplSetCmdline*);
+ void startReplication() {
+ /* if we are going to be a replica set, we aren't doing other forms of replication. */
+ if( !cmdLine._replSet.empty() ) {
+ if( replSettings.slave || replSettings.master ) {
+ log() << "***" << endl;
+ log() << "ERROR: can't use --slave or --master replication options with --replSet" << endl;
+ log() << "***" << endl;
+ }
+ newRepl();
+
+ replSet = true;
+ ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet);
+ boost::thread t( boost::bind( &startReplSets, replSetCmdline) );
+
+ return;
+ }
+
+ oldRepl();
+
+ /* this was just to see if anything locks for longer than it should -- we need to be careful
+ not to be locked when trying to connect() or query() the other side.
+ */
+ //boost::thread tempt(tempThread);
+
+ if( !replSettings.slave && !replSettings.master )
+ return;
+
+ {
+ dblock lk;
+ replLocalAuth();
+ }
+
+ if ( replSettings.slave ) {
+ assert( replSettings.slave == SimpleSlave );
+ log(1) << "slave=true" << endl;
+ boost::thread repl_thread(replSlaveThread);
+ }
+
+ if ( replSettings.master ) {
+ log(1) << "master=true" << endl;
+ replSettings.master = true;
+ createOplog();
+ boost::thread t(replMasterThread);
+ }
+
+ while( replSettings.fastsync ) // don't allow writes until we've set up from log
+ sleepmillis( 50 );
+ }
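+
+ /* illustrative summary of the paths above:
+ --replSet <name>[/<seeds>] -> newRepl(), background thread runs startReplSets()
+ --master -> oldRepl(), createOplog(), replMasterThread
+ --slave -> oldRepl(), replSlaveThread (sources come from local.sources)
+ */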
+
+ void testPretouch() {
+ int nthr = min(8, 8);
+ nthr = max(nthr, 1);
+ int m = 8 / nthr;
+ ThreadPool tp(nthr);
+ vector<BSONObj> v;
+
+ BSONObj x = BSON( "ns" << "test.foo" << "o" << BSON( "_id" << 1 ) << "op" << "i" );
+
+ v.push_back(x);
+ v.push_back(x);
+ v.push_back(x);
+
+ unsigned a = 0;
+ while( 1 ) {
+ if( a >= v.size() ) break;
+ unsigned b = a + m - 1; // v[a..b]
+ if( b >= v.size() ) b = v.size() - 1;
+ tp.schedule(pretouchN, v, a, b);
+ DEV cout << "pretouch task: " << a << ".." << b << endl;
+ a += m;
+ }
+ tp.join();
+ }
+
+ class ReplApplyBatchSizeValidator : public ParameterValidator {
+ public:
+ ReplApplyBatchSizeValidator() : ParameterValidator( "replApplyBatchSize" ) {}
+
+ virtual bool isValid( BSONElement e , string& errmsg ) const {
+ int b = e.numberInt();
+ if( b < 1 || b > 1024 ) {
+ errmsg = "replApplyBatchSize has to be >= 1 and <= 1024";
+ return false;
+ }
+
+ if ( replSettings.slavedelay != 0 && b > 1 ) {
+ errmsg = "can't use a batch size > 1 with slavedelay";
+ return false;
+ }
+ if ( ! replSettings.slave ) {
+ errmsg = "can't set replApplyBatchSize on a non-slave machine";
+ return false;
+ }
+
+ return true;
+ }
+ } replApplyBatchSizeValidator;
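+
+ /* illustrative only (assumed usage): this validator is consulted when the parameter
+ is changed at runtime, e.g. from the shell:
+ db.adminCommand( { setParameter: 1, replApplyBatchSize: 8 } )
+ which succeeds only on a slave with no slavedelay and 1 <= value <= 1024. */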
+
+} // namespace mongo
diff --git a/src/mongo/db/repl.h b/src/mongo/db/repl.h
new file mode 100644
index 00000000000..83242d0a4ce
--- /dev/null
+++ b/src/mongo/db/repl.h
@@ -0,0 +1,199 @@
+// repl.h - replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* replication data overview
+
+ at the slave:
+ local.sources { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ at the master:
+ local.oplog.$<source>
+*/
+
+#pragma once
+
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "../client/dbclient.h"
+#include "../util/optime.h"
+#include "oplog.h"
+#include "../util/concurrency/thread_pool.h"
+#include "oplogreader.h"
+#include "cloner.h"
+
+namespace mongo {
+
+ /* is this server a replication slave?
+ --slave cmd line setting -> SimpleSlave
+ */
+ typedef enum { NotSlave=0, SimpleSlave } SlaveTypes;
+
+ class ReplSettings {
+ public:
+ SlaveTypes slave;
+
+ /** true means we are master and doing replication. if we are not writing to oplog, this won't be true. */
+ bool master;
+
+ bool fastsync;
+
+ bool autoresync;
+
+ int slavedelay;
+
+ set<string> discoveredSeeds;
+ mutex discoveredSeeds_mx;
+
+ BSONObj reconfig;
+
+ ReplSettings()
+ : slave(NotSlave),
+ master(false),
+ fastsync(),
+ autoresync(false),
+ slavedelay(),
+ discoveredSeeds(),
+ discoveredSeeds_mx("ReplSettings::discoveredSeeds") {
+ }
+
+ };
+
+ extern ReplSettings replSettings;
+
+ /* A replication exception */
+ class SyncException : public DBException {
+ public:
+ SyncException() : DBException( "sync exception" , 10001 ) {}
+ };
+
+ /* A Source is a source from which we can pull (replicate) data.
+ stored in collection local.sources.
+
+ Can be a group of things to replicate for several databases.
+
+ { host: ..., source: ..., only: ..., syncedTo: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } }
+
+ 'source' defaults to 'main'; support for multiple source names is
+ not done (always use main for now).
+ */
+ class ReplSource {
+ shared_ptr<ThreadPool> tp;
+
+ void resync(string db);
+
+ /** @param alreadyLocked caller already put us in write lock if true */
+ void sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked);
+
+ /* pull some operations from the master's oplog, and apply them.
+ calls sync_pullOpLog_applyOperation
+ */
+ int sync_pullOpLog(int& nApplied);
+
+ /* we only clone one database per pass, even if a lot need done. This helps us
+ avoid overflowing the master's transaction log by doing too much work before going
+ back to read more transactions. (Imagine a scenario of slave startup where we try to
+ clone 100 databases in one pass.)
+ */
+ set<string> addDbNextPass;
+
+ set<string> incompleteCloneDbs;
+
+ ReplSource();
+
+ // returns the dummy ns used to do the drop
+ string resyncDrop( const char *db, const char *requester );
+ // call without the db mutex
+ void syncToTailOfRemoteLog();
+ string ns() const { return string( "local.oplog.$" ) + sourceName(); }
+ unsigned _sleepAdviceTime;
+
+ /**
+ * If 'db' is a new database and its name would conflict with that of
+ * an existing database, synchronize these database names with the
+ * master.
+ * @return true iff an op with the specified ns may be applied.
+ */
+ bool handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db );
+
+ public:
+ OplogReader oplogReader;
+
+ void applyOperation(const BSONObj& op);
+ string hostName; // ip addr or hostname plus optionally, ":<port>"
+ string _sourceName; // a logical source name.
+ string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; }
+ string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating.
+
+ /* the last time point we have already synced up to (in the remote/master's oplog). */
+ OpTime syncedTo;
+
+ int nClonedThisPass;
+
+ typedef vector< shared_ptr< ReplSource > > SourceVector;
+ static void loadAll(SourceVector&);
+ explicit ReplSource(BSONObj);
+
+ /* -1 = error */
+ int sync(int& nApplied);
+
+ void save(); // write ourself to local.sources
+
+ // make a jsobj from our member fields of the form
+ // { host: ..., source: ..., syncedTo: ... }
+ BSONObj jsobj();
+
+ bool operator==(const ReplSource&r) const {
+ return hostName == r.hostName && sourceName() == r.sourceName();
+ }
+ string toString() const { return sourceName() + "@" + hostName; }
+
+ bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); }
+ int sleepAdvice() const {
+ if ( !_sleepAdviceTime )
+ return 0;
+ int wait = _sleepAdviceTime - unsigned( time( 0 ) );
+ return wait > 0 ? wait : 0;
+ }
+
+ static bool throttledForceResyncDead( const char *requester );
+ static void forceResyncDead( const char *requester );
+ void forceResync( const char *requester );
+ };
+
+ bool anyReplEnabled();
+ void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 );
+
+ /**
+ * Helper class used to set and query an ignore state for a named database.
+ * The ignore state will expire after a specified OpTime.
+ */
+ class DatabaseIgnorer {
+ public:
+ /** Indicate that operations for 'db' should be ignored until after 'futureOplogTime' */
+ void doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime );
+ /**
+ * Query ignore state of 'db'; if 'currentOplogTime' is after the ignore
+ * limit, the ignore state will be cleared.
+ */
+ bool ignoreAt( const string &db, const OpTime &currentOplogTime );
+ private:
+ map< string, OpTime > _ignores;
+ };
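+
+ /* usage sketch (illustrative only; the db name and 'futureOplogTime' are hypothetical):
+ DatabaseIgnorer ignorer;
+ ignorer.doIgnoreUntilAfter( "accounting", futureOplogTime );
+ if ( ignorer.ignoreAt( "accounting", currentOplogTime ) ) {
+ // skip ops for "accounting"; the state clears itself once
+ // currentOplogTime passes futureOplogTime
+ }
+ */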
+
+} // namespace mongo
diff --git a/src/mongo/db/repl/connections.h b/src/mongo/db/repl/connections.h
new file mode 100644
index 00000000000..3e08f80b047
--- /dev/null
+++ b/src/mongo/db/repl/connections.h
@@ -0,0 +1,128 @@
+// @file
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+#include "../../client/dbclient.h"
+#include "../security_common.h"
+
+namespace mongo {
+
+ /** here we keep a single connection (with reconnect) for a set of hosts,
+ one each, and allow one user at a time per host. if in use already for that
+ host, we block. so this is an easy way to keep a 1-deep pool of connections
+ that many threads can share.
+
+ thread-safe.
+
+ Example:
+ {
+ ScopedConn c("foo.acme.com:9999");
+ c->runCommand(...);
+ }
+
+ throws exception on connect error (but fine to try again later with a new
+ scopedconn object for same host).
+ */
+ class ScopedConn {
+ public:
+ /** throws assertions if connect failure etc. */
+ ScopedConn(string hostport);
+ ~ScopedConn() {
+ // conLock releases...
+ }
+ void reconnect() {
+ conn()->port().shutdown();
+ connect();
+ }
+
+ /* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic.
+ So here what we do is wrap known safe methods and not allow cursor-style queries at all. This makes
+ ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed.
+ */
+ bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) {
+ return conn()->runCommand(dbname, cmd, info, options);
+ }
+ unsigned long long count(const string &ns) {
+ return conn()->count(ns);
+ }
+ BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) {
+ return conn()->findOne(ns, q, fieldsToReturn, queryOptions);
+ }
+
+ private:
+ auto_ptr<scoped_lock> connLock;
+ static mongo::mutex mapMutex;
+ struct X {
+ mongo::mutex z;
+ DBClientConnection cc;
+ bool connected;
+ X() : z("X"), cc(/*reconnect*/ true, 0, /*timeout*/ 10.0), connected(false) {
+ cc._logLevel = 2;
+ }
+ } *x;
+ typedef map<string,ScopedConn::X*> M;
+ static M& _map;
+ DBClientConnection* conn() { return &x->cc; }
+ const string _hostport;
+
+ // we should already be locked...
+ bool connect() {
+ string err;
+ if (!x->cc.connect(_hostport, err)) {
+ log() << "couldn't connect to " << _hostport << ": " << err << rsLog;
+ return false;
+ }
+ x->connected = true;
+
+ // if we cannot authenticate against a member, then either its key file
+ // or our key file has to change. if our key file has to change, we'll
+ // be rebooting. if their file has to change, they'll be rebooted so the
+ // connection created above will go dead, reconnect, and reauth.
+ if (!noauth && !x->cc.auth("local", internalSecurity.user, internalSecurity.pwd, err, false)) {
+ log() << "could not authenticate against " << _hostport << ", " << err << rsLog;
+ return false;
+ }
+
+ return true;
+ }
+ };
+
+ inline ScopedConn::ScopedConn(string hostport) : _hostport(hostport) {
+ bool first = false;
+ {
+ scoped_lock lk(mapMutex);
+ x = _map[_hostport];
+ if( x == 0 ) {
+ x = _map[_hostport] = new X();
+ first = true;
+ connLock.reset( new scoped_lock(x->z) );
+ }
+ }
+
+ // Keep trying to connect if we're not yet connected; even on the retry path we
+ // must hold the per-host lock before touching the shared connection object.
+ if( !first ) {
+ connLock.reset( new scoped_lock(x->z) );
+ if( x->connected )
+ return;
+ }
+
+ connect();
+ }
+
+}
diff --git a/src/mongo/db/repl/consensus.cpp b/src/mongo/db/repl/consensus.cpp
new file mode 100644
index 00000000000..3995373f5ef
--- /dev/null
+++ b/src/mongo/db/repl/consensus.cpp
@@ -0,0 +1,449 @@
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../commands.h"
+#include "rs.h"
+#include "multicmd.h"
+
+namespace mongo {
+
+ class CmdReplSetFresh : public ReplSetCommand {
+ public:
+ CmdReplSetFresh() : ReplSetCommand("replSetFresh") { }
+ private:
+
+ bool shouldVeto(const BSONObj& cmdObj, string& errmsg) {
+ unsigned id = cmdObj["id"].Int();
+ const Member* primary = theReplSet->box.getPrimary();
+ const Member* hopeful = theReplSet->findById(id);
+ const Member *highestPriority = theReplSet->getMostElectable();
+
+ if( !hopeful ) {
+ errmsg = str::stream() << "replSet couldn't find member with id " << id;
+ return true;
+ }
+ else if( theReplSet->isPrimary() && theReplSet->lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
+ // hbinfo is not updated, so we have to check the primary's last optime separately
+ errmsg = str::stream() << "I am already primary, " << hopeful->fullName() <<
+ " can try again once I've stepped down";
+ return true;
+ }
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
+ // other members might be aware of more up-to-date nodes
+ errmsg = str::stream() << hopeful->fullName() << " is trying to elect itself but " <<
+ primary->fullName() << " is already primary and more up-to-date";
+ return true;
+ }
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ errmsg = str::stream() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ return true;
+ }
+
+ // don't veto older versions
+ if (cmdObj["id"].eoo()) {
+ // they won't be looking for the veto field
+ return false;
+ }
+
+ if ( !theReplSet->isElectable(id) ||
+ (highestPriority && highestPriority->config().priority > hopeful->config().priority)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+
+ if( cmdObj["set"].String() != theReplSet->name() ) {
+ errmsg = "wrong repl set name";
+ return false;
+ }
+ string who = cmdObj["who"].String();
+ int cfgver = cmdObj["cfgver"].Int();
+ OpTime opTime(cmdObj["opTime"].Date());
+
+ bool weAreFresher = false;
+ if( theReplSet->config().version > cfgver ) {
+ log() << "replSet member " << who << " is not yet aware its cfg version " << cfgver << " is stale" << rsLog;
+ result.append("info", "config version stale");
+ weAreFresher = true;
+ }
+ // check not only our own optime, but any other member we can reach
+ else if( opTime < theReplSet->lastOpTimeWritten ||
+ opTime < theReplSet->lastOtherOpTime()) {
+ weAreFresher = true;
+ }
+ result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
+ result.append("fresher", weAreFresher);
+ result.append("veto", shouldVeto(cmdObj, errmsg));
+
+ return true;
+ }
+ } cmdReplSetFresh;
+
+ class CmdReplSetElect : public ReplSetCommand {
+ public:
+ CmdReplSetElect() : ReplSetCommand("replSetElect") { }
+ private:
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ theReplSet->elect.electCmdReceived(cmdObj, &result);
+ return true;
+ }
+ } cmdReplSetElect;
+
+ int Consensus::totalVotes() const {
+ static int complain = 0;
+ int vTot = rs._self->config().votes;
+ for( Member *m = rs.head(); m; m=m->next() )
+ vTot += m->config().votes;
+ if( vTot % 2 == 0 && vTot && complain++ == 0 )
+ log() << "replSet " /*buildbot! warning */ "total number of votes is even - add arbiter or give one member an extra vote" << rsLog;
+ return vTot;
+ }
+
+ bool Consensus::aMajoritySeemsToBeUp() const {
+ int vUp = rs._self->config().votes;
+ for( Member *m = rs.head(); m; m=m->next() )
+ vUp += m->hbinfo().up() ? m->config().votes : 0;
+ return vUp * 2 > totalVotes();
+ }
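+
+ /* e.g. with three members of one vote each, totalVotes() == 3; if two are up,
+ vUp == 2 and 2*2 > 3, so a majority seems up. if only one is up, 1*2 > 3 fails. */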
+
+ bool Consensus::shouldRelinquish() const {
+ int vUp = rs._self->config().votes;
+ const long long T = rs.config().ho.heartbeatTimeoutMillis * rs.config().ho.heartbeatConnRetries;
+ for( Member *m = rs.head(); m; m=m->next() ) {
+ long long dt = m->hbinfo().timeDown();
+ if( dt < T )
+ vUp += m->config().votes;
+ }
+
+ // the manager will handle calling stepdown if another node should be
+ // primary due to priority
+
+ return !( vUp * 2 > totalVotes() );
+ }
+
+ static const int VETO = -10000;
+
+ const time_t LeaseTime = 30;
+
+ SimpleMutex Consensus::lyMutex("ly");
+
+ unsigned Consensus::yea(unsigned memberId) { /* throws VoteException */
+ SimpleMutex::scoped_lock lk(lyMutex);
+ LastYea &L = this->ly.ref(lk);
+ time_t now = time(0);
+ if( L.when + LeaseTime >= now && L.who != memberId ) {
+ LOG(1) << "replSet not voting yea for " << memberId <<
+ " voted for " << L.who << ' ' << now-L.when << " secs ago" << rsLog;
+ throw VoteException();
+ }
+ L.when = now;
+ L.who = memberId;
+ return rs._self->config().votes;
+ }
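+
+ /* e.g. if we granted a yea to member 3 ten seconds ago, a request from member 5
+ throws VoteException until the 30 second LeaseTime lapses; a repeat request from
+ member 3 itself simply renews the lease. */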
+
+ /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in
+ place instead of leaving it for a long time.
+ */
+ void Consensus::electionFailed(unsigned meid) {
+ SimpleMutex::scoped_lock lk(lyMutex);
+ LastYea &L = ly.ref(lk);
+ DEV assert( L.who == meid ); // this may not always hold, so be aware, but adding for now as a quick sanity test
+ if( L.who == meid )
+ L.when = 0;
+ }
+
+ /* todo: threading **************** !!!!!!!!!!!!!!!! */
+ void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
+ BSONObjBuilder& b = *_b;
+ DEV log() << "replSet received elect msg " << cmd.toString() << rsLog;
+ else LOG(2) << "replSet received elect msg " << cmd.toString() << rsLog;
+ string set = cmd["set"].String();
+ unsigned whoid = cmd["whoid"].Int();
+ int cfgver = cmd["cfgver"].Int();
+ OID round = cmd["round"].OID();
+ int myver = rs.config().version;
+
+ const Member* primary = rs.box.getPrimary();
+ const Member* hopeful = rs.findById(whoid);
+ const Member* highestPriority = rs.getMostElectable();
+
+ int vote = 0;
+ if( set != rs.name() ) {
+ log() << "replSet error received an elect request for '" << set << "' but our set name is '" << rs.name() << "'" << rsLog;
+ }
+ else if( myver < cfgver ) {
+ // we are stale. don't vote
+ }
+ else if( myver > cfgver ) {
+ // they are stale!
+ log() << "replSet electCmdReceived info got stale version # during election" << rsLog;
+ vote = -10000;
+ }
+ else if( !hopeful ) {
+ log() << "replSet electCmdReceived couldn't find member with id " << whoid << rsLog;
+ vote = -10000;
+ }
+ else if( primary && primary == rs._self && rs.lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
+ // hbinfo is not updated, so we have to check the primary's last optime separately
+ log() << "I am already primary, " << hopeful->fullName()
+ << " can try again once I've stepped down" << rsLog;
+ vote = -10000;
+ }
+ else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
+ // other members might be aware of more up-to-date nodes
+ log() << hopeful->fullName() << " is trying to elect itself but " <<
+ primary->fullName() << " is already primary and more up-to-date" << rsLog;
+ vote = -10000;
+ }
+ else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
+ log() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName();
+ vote = -10000;
+ }
+ else {
+ try {
+ vote = yea(whoid);
+ dassert( hopeful->id() == whoid );
+ rs.relinquish();
+ log() << "replSet info voting yea for " << hopeful->fullName() << " (" << whoid << ')' << rsLog;
+ }
+ catch(VoteException&) {
+ log() << "replSet voting no for " << hopeful->fullName() << " already voted for another" << rsLog;
+ }
+ }
+
+ b.append("vote", vote);
+ b.append("round", round);
+ }
+
+ void ReplSetImpl::_getTargets(list<Target>& L, int& configVersion) {
+ configVersion = config().version;
+ for( Member *m = head(); m; m=m->next() )
+ if( m->hbinfo().maybeUp() )
+ L.push_back( Target(m->fullName()) );
+ }
+
+ /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need
+ to check later that the config didn't change. */
+ void ReplSetImpl::getTargets(list<Target>& L, int& configVersion) {
+ if( lockedByMe() ) {
+ _getTargets(L, configVersion);
+ return;
+ }
+ lock lk(this);
+ _getTargets(L, configVersion);
+ }
+
+ /* Do we have the newest data of them all?
+ @param allUp - set to true if all members are up. Only set if true returned.
+ @return true if we are freshest. Note we may tie.
+ */
+ bool Consensus::weAreFreshest(bool& allUp, int& nTies) {
+ const OpTime ord = theReplSet->lastOpTimeWritten;
+ nTies = 0;
+ assert( !ord.isNull() );
+ BSONObj cmd = BSON(
+ "replSetFresh" << 1 <<
+ "set" << rs.name() <<
+ "opTime" << Date_t(ord.asDate()) <<
+ "who" << rs._self->fullName() <<
+ "cfgver" << rs._cfg->version <<
+ "id" << rs._self->id());
+ list<Target> L;
+ int ver;
+ /* the following queries arbiters, even though they are never fresh. wonder if that makes sense.
+ it doesn't, but it could, if they "knew" what freshness is one day. so consider removing
+ arbiters from getTargets() here. although getTargets is used elsewhere for elections; there
+ arbiters are certainly targets - so an "includeArbs" bool would be necessary if we wanted
+ to skip fetching them here.
+ */
+ rs.getTargets(L, ver);
+ multiCommand(cmd, L);
+ int nok = 0;
+ allUp = true;
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ if( i->ok ) {
+ nok++;
+ if( i->result["fresher"].trueValue() ) {
+ log() << "not electing self, we are not freshest" << rsLog;
+ return false;
+ }
+ OpTime remoteOrd( i->result["opTime"].Date() );
+ if( remoteOrd == ord )
+ nTies++;
+ assert( remoteOrd <= ord );
+
+ if( i->result["veto"].trueValue() ) {
+ BSONElement msg = i->result["errmsg"];
+ if (!msg.eoo()) {
+ log() << "not electing self, " << i->toHost << " would veto with '" <<
+ msg.String() << "'" << rsLog;
+ }
+ else {
+ log() << "not electing self, " << i->toHost << " would veto" << rsLog;
+ }
+ return false;
+ }
+ }
+ else {
+ DEV log() << "replSet freshest returns " << i->result.toString() << rsLog;
+ allUp = false;
+ }
+ }
+ LOG(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog;
+ assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working...
+ return true;
+ }
+
+ extern time_t started;
+
+ void Consensus::multiCommand(BSONObj cmd, list<Target>& L) {
+ assert( !rs.lockedByMe() );
+ mongo::multiCommand(cmd, L);
+ }
+
+ void Consensus::_electSelf() {
+ if( time(0) < steppedDown )
+ return;
+
+ {
+ const OpTime ord = theReplSet->lastOpTimeWritten;
+ if( ord == 0 ) {
+ log() << "replSet info not trying to elect self, do not yet have a complete set of data from any point in time" << rsLog;
+ return;
+ }
+ }
+
+ bool allUp;
+ int nTies;
+ if( !weAreFreshest(allUp, nTies) ) {
+ return;
+ }
+
+ rs.sethbmsg("",9);
+
+ if( !allUp && time(0) - started < 60 * 5 ) {
+ /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data
+ if we don't have to -- we'd rather be offline and wait a little longer instead
+ todo: make this configurable.
+ */
+ rs.sethbmsg("not electing self, not all members up and we have been up less than 5 minutes");
+ return;
+ }
+
+ Member& me = *rs._self;
+
+ if( nTies ) {
+ /* tie? we then randomly sleep to try to not collide on our voting. */
+ /* todo: smarter. */
+ if( me.id() == 0 || sleptLast ) {
+ // would be fine for one node not to sleep
+ // todo: biggest / highest priority nodes should be the ones that get to not sleep
+ }
+ else {
+ assert( !rs.lockedByMe() ); // bad to go to sleep locked
+ unsigned ms = ((unsigned) rand()) % 1000 + 50;
+ DEV log() << "replSet tie " << nTies << " sleeping a little " << ms << "ms" << rsLog;
+ sleptLast = true;
+ sleepmillis(ms);
+ throw RetryAfterSleepException();
+ }
+ }
+ sleptLast = false;
+
+ time_t start = time(0);
+ unsigned meid = me.id();
+ int tally = yea( meid );
+ bool success = false;
+ try {
+ log() << "replSet info electSelf " << meid << rsLog;
+
+ BSONObj electCmd = BSON(
+ "replSetElect" << 1 <<
+ "set" << rs.name() <<
+ "who" << me.fullName() <<
+ "whoid" << me.hbinfo().id() <<
+ "cfgver" << rs._cfg->version <<
+ "round" << OID::gen() /* this is just for diagnostics */
+ );
+
+ int configVersion;
+ list<Target> L;
+ rs.getTargets(L, configVersion);
+ multiCommand(electCmd, L);
+
+ {
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ DEV log() << "replSet elect res: " << i->result.toString() << rsLog;
+ if( i->ok ) {
+ int v = i->result["vote"].Int();
+ tally += v;
+ }
+ }
+ if( tally*2 <= totalVotes() ) {
+ log() << "replSet couldn't elect self, only received " << tally << " votes" << rsLog;
+ }
+ else if( time(0) - start > 30 ) {
+ // defensive; should never happen as we have timeouts on connection and operation for our conn
+ log() << "replSet too much time passed during our election, ignoring result" << rsLog;
+ }
+ else if( configVersion != rs.config().version ) {
+ log() << "replSet config version changed during our election, ignoring result" << rsLog;
+ }
+ else {
+ /* succeeded. */
+ log(1) << "replSet election succeeded, assuming primary role" << rsLog;
+ success = true;
+ rs.assumePrimary();
+ }
+ }
+ }
+ catch( std::exception& ) {
+ if( !success ) electionFailed(meid);
+ throw;
+ }
+ if( !success ) electionFailed(meid);
+ }
+
+ void Consensus::electSelf() {
+ assert( !rs.lockedByMe() );
+ assert( !rs.myConfig().arbiterOnly );
+ assert( rs.myConfig().slaveDelay == 0 );
+ try {
+ _electSelf();
+ }
+ catch(RetryAfterSleepException&) {
+ throw;
+ }
+ catch(VoteException& ) {
+ log() << "replSet not trying to elect self as responded yea to someone else recently" << rsLog;
+ }
+ catch(DBException& e) {
+ log() << "replSet warning caught unexpected exception in electSelf() " << e.toString() << rsLog;
+ }
+ catch(...) {
+ log() << "replSet warning caught unexpected exception in electSelf()" << rsLog;
+ }
+ }
+
+}
diff --git a/src/mongo/db/repl/health.cpp b/src/mongo/db/repl/health.cpp
new file mode 100644
index 00000000000..0b7ed87eac3
--- /dev/null
+++ b/src/mongo/db/repl/health.cpp
@@ -0,0 +1,449 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "health.h"
+#include "../../util/background.h"
+#include "../../client/dbclient.h"
+#include "../../client/connpool.h"
+#include "../commands.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/task.h"
+#include "../../util/mongoutils/html.h"
+#include "../../util/goodies.h"
+#include "../../util/ramlog.h"
+#include "../helpers/dblogger.h"
+#include "connections.h"
+#include "../../util/unittest.h"
+#include "../dbhelpers.h"
+
+namespace mongo {
+ /* decls for connections.h */
+ ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
+ mutex ScopedConn::mapMutex("ScopedConn::mapMutex");
+}
+
+namespace mongo {
+
+ using namespace mongoutils::html;
+ using namespace bson;
+
+ static RamLog * _rsLog = new RamLog( "rs" );
+ Tee *rsLog = _rsLog;
+ extern bool replSetBlind; // for testing
+
+ string ago(time_t t) {
+ if( t == 0 ) return "";
+
+ time_t x = time(0) - t;
+ stringstream s;
+ if( x < 180 ) {
+ s << x << " sec";
+ if( x != 1 ) s << 's';
+ }
+ else if( x < 3600 ) {
+ s.precision(2);
+ s << x / 60.0 << " mins";
+ }
+ else {
+ s.precision(2);
+ s << x / 3600.0 << " hrs";
+ }
+ return s.str();
+ }
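+
+ /* illustrative outputs: ago(now-90) -> "90 secs", ago(now-600) -> "10 mins",
+ ago(now-7200) -> "2 hrs" (minutes and hours printed with two significant digits). */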
+
+ void Member::summarizeMember(stringstream& s) const {
+ s << tr();
+ {
+ stringstream u;
+ u << "http://" << h().host() << ':' << (h().port() + 1000) << "/_replSet";
+ s << td( a(u.str(), "", fullName()) );
+ }
+ s << td( id() );
+ double h = hbinfo().health;
+ bool ok = h > 0;
+ s << td(red(str::stream() << h,h == 0));
+ s << td(ago(hbinfo().upSince));
+ bool never = false;
+ {
+ string h;
+ time_t hb = hbinfo().lastHeartbeat;
+ if( hb == 0 ) {
+ h = "never";
+ never = true;
+ }
+ else h = ago(hb) + " ago";
+ s << td(h);
+ }
+ s << td(config().votes);
+ s << td(config().priority);
+ {
+ string stateText = state().toString();
+ if( _config.hidden )
+ stateText += " (hidden)";
+ if( ok || stateText.empty() )
+ s << td(stateText); // text blank if we've never connected
+ else
+ s << td( grey(str::stream() << "(was " << state().toString() << ')', true) );
+ }
+ s << td( grey(hbinfo().lastHeartbeatMsg,!ok) );
+ stringstream q;
+ q << "/_replSetOplog?_id=" << id();
+ s << td( a(q.str(), "", never ? "?" : hbinfo().opTime.toString()) );
+ if( hbinfo().skew > INT_MIN ) {
+ s << td( grey(str::stream() << hbinfo().skew,!ok) );
+ }
+ else
+ s << td("");
+ s << _tr();
+ }
+
+ string ReplSetImpl::stateAsHtml(MemberState s) {
+ if( s.s == MemberState::RS_STARTUP ) return a("", "server still starting up, or still trying to initiate the set", "STARTUP");
+ if( s.s == MemberState::RS_PRIMARY ) return a("", "this server thinks it is primary", "PRIMARY");
+ if( s.s == MemberState::RS_SECONDARY ) return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
+ if( s.s == MemberState::RS_RECOVERING ) return a("", "recovering/resyncing; after recovery usually auto-transitions to secondary", "RECOVERING");
+ if( s.s == MemberState::RS_FATAL ) return a("", "something bad has occurred and the server is not completely offline with regard to the replica set. fatal error.", "FATAL");
+ if( s.s == MemberState::RS_STARTUP2 ) return a("", "loaded config, still determining who is primary", "STARTUP2");
+ if( s.s == MemberState::RS_ARBITER ) return a("", "this server is an arbiter only", "ARBITER");
+ if( s.s == MemberState::RS_DOWN ) return a("", "member is down, slow, or unreachable", "DOWN");
+ if( s.s == MemberState::RS_ROLLBACK ) return a("", "rolling back operations to get in sync", "ROLLBACK");
+ return "";
+ }
+
+ extern time_t started;
+
+ // oplogdiags in web ui
+ static void say(stringstream&ss, const bo& op) {
+ ss << "<tr>";
+
+ set<string> skip;
+ be e = op["ts"];
+ if( e.type() == Date || e.type() == Timestamp ) {
+ OpTime ot = e._opTime();
+ ss << td( time_t_to_String_short( ot.getSecs() ) );
+ ss << td( ot.toString() );
+ skip.insert("ts");
+ }
+ else ss << td("?") << td("?");
+
+ e = op["h"];
+ if( e.type() == NumberLong ) {
+ ss << "<td>" << hex << e.Long() << "</td>\n";
+ skip.insert("h");
+ }
+ else
+ ss << td("?");
+
+ ss << td(op["op"].valuestrsafe());
+ ss << td(op["ns"].valuestrsafe());
+ skip.insert("op");
+ skip.insert("ns");
+
+ ss << "<td>";
+ for( bo::iterator i(op); i.more(); ) {
+ be e = i.next();
+ if( skip.count(e.fieldName()) ) continue;
+ ss << e.toString() << ' ';
+ }
+ ss << "</td></tr>\n";
+ }
+
+ void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const {
+ const Member *m = findById(server_id);
+ if( m == 0 ) {
+ ss << "Error : can't find a member with id: " << server_id << '\n';
+ return;
+ }
+
+ ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog );
+
+ //const bo fields = BSON( "o" << false << "o2" << false );
+ const bo fields;
+
+ /** todo: fix - we might want a socket (SO) timeout here */
+ DBClientConnection conn(false, 0, /*timeout*/ 20);
+ {
+ string errmsg;
+ if( !conn.connect(m->fullName(), errmsg) ) {
+ ss << "couldn't connect to " << m->fullName() << ' ' << errmsg;
+ return;
+ }
+ }
+
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields);
+ if( c.get() == 0 ) {
+ ss << "couldn't query " << rsoplog;
+ return;
+ }
+ static const char *h[] = {"ts","optime", "h","op","ns","rest",0};
+
+ ss << "<style type=\"text/css\" media=\"screen\">"
+ "table { font-size:75% }\n"
+ // "th { background-color:#bbb; color:#000 }\n"
+ // "td,th { padding:.25em }\n"
+ "</style>\n";
+
+ ss << table(h, true);
+ //ss << "<pre>\n";
+ int n = 0;
+ OpTime otFirst;
+ OpTime otLast;
+ OpTime otEnd;
+ while( c->more() ) {
+ bo o = c->next();
+ otLast = o["ts"]._opTime();
+ if( otFirst.isNull() )
+ otFirst = otLast;
+ say(ss, o);
+ n++;
+ }
+ if( n == 0 ) {
+ ss << rsoplog << " is empty\n";
+ }
+ else {
+ auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields);
+ if( c.get() == 0 ) {
+ ss << "couldn't query [2] " << rsoplog;
+ return;
+ }
+ string x;
+ bo o = c->next();
+ otEnd = o["ts"]._opTime();
+ while( 1 ) {
+ stringstream z;
+ if( o["ts"]._opTime() == otLast )
+ break;
+ say(z, o);
+ x = z.str() + x;
+ if( !c->more() )
+ break;
+ o = c->next();
+ }
+ if( !x.empty() ) {
+ ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x;
+ //ss << "\n...\n\n" << x;
+ }
+ }
+ ss << _table();
+ ss << p(time_t_to_String_short(time(0)) + " current time");
+
+ if( !otEnd.isNull() ) {
+ ss << "<p>Log length in time: ";
+ unsigned d = otEnd.getSecs() - otFirst.getSecs();
+ double h = d / 3600.0;
+ ss.precision(3);
+ if( h < 72 )
+ ss << h << " hours";
+ else
+ ss << h / 24.0 << " days";
+ ss << "</p>\n";
+ }
+ }
+
+ void ReplSetImpl::_summarizeAsHtml(stringstream& s) const {
+ s << table(0, false);
+ s << tr("Set name:", _name);
+ s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" );
+ s << _table();
+
+ const char *h[] = {"Member",
+ "<a title=\"member id in the replset config\">id</a>",
+ "Up",
+ "<a title=\"length of time we have been continuously connected to the other member with no reconnects (for self, shows uptime)\">cctime</a>",
+ "<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>",
+ "Votes", "Priority", "State", "Messages",
+ "<a title=\"how up to date this server is. this value is polled every few seconds, so the actual lag is typically much lower than shown here.\">optime</a>",
+ "<a title=\"Clock skew in seconds relative to this server. Informational; server clock variances will make the diagnostics hard to read, but otherwise are benign.\">skew</a>",
+ 0
+ };
+ s << table(h);
+
+ /* this is to sort the member rows by their ordinal _id, so they show up in the same
+ order on all the different web ui's; that is less confusing for the operator. */
+ map<int,string> mp;
+
+ string myMinValid;
+ try {
+ readlocktry lk("local.replset.minvalid", 300);
+ if( lk.got() ) {
+ BSONObj mv;
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ myMinValid = "minvalid:" + mv["ts"]._opTime().toString();
+ }
+ }
+ else myMinValid = ".";
+ }
+ catch(...) {
+ myMinValid = "exception fetching minvalid";
+ }
+
+ const Member *_self = this->_self;
+ assert(_self);
+ {
+ stringstream s;
+ /* self row */
+ s << tr() << td(_self->fullName() + " (me)") <<
+ td(_self->id()) <<
+ td("1") << //up
+ td(ago(started)) <<
+ td("") << // last heartbeat
+ td(ToString(_self->config().votes)) <<
+ td(ToString(_self->config().priority)) <<
+ td( stateAsHtml(box.getState()) + (_self->config().hidden?" (hidden)":"") );
+ s << td( _hbmsg );
+ stringstream q;
+ q << "/_replSetOplog?_id=" << _self->id();
+ s << td( a(q.str(), myMinValid, theReplSet->lastOpTimeWritten.toString()) );
+ s << td(""); // skew
+ s << _tr();
+ mp[_self->hbinfo().id()] = s.str();
+ }
+ Member *m = head();
+ while( m ) {
+ stringstream s;
+ m->summarizeMember(s);
+ mp[m->hbinfo().id()] = s.str();
+ m = m->next();
+ }
+
+ for( map<int,string>::const_iterator i = mp.begin(); i != mp.end(); i++ )
+ s << i->second;
+ s << _table();
+ }
+
+
+ void fillRsLog(stringstream& s) {
+ _rsLog->toHTML( s );
+ }
+
+ const Member* ReplSetImpl::findById(unsigned id) const {
+ if( _self && id == _self->id() ) return _self;
+
+ for( Member *m = head(); m; m = m->next() )
+ if( m->id() == id )
+ return m;
+ return 0;
+ }
+
+ const OpTime ReplSetImpl::lastOtherOpTime() const {
+ OpTime closest(0,0);
+
+ for( Member *m = _members.head(); m; m=m->next() ) {
+ if (!m->hbinfo().up()) {
+ continue;
+ }
+
+ if (m->hbinfo().opTime > closest) {
+ closest = m->hbinfo().opTime;
+ }
+ }
+
+ return closest;
+ }
+
+ void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const {
+ vector<BSONObj> v;
+
+ const Member *_self = this->_self;
+ assert( _self );
+
+ MemberState myState = box.getState();
+
+ // add self
+ {
+ BSONObjBuilder bb;
+ bb.append("_id", (int) _self->id());
+ bb.append("name", _self->fullName());
+ bb.append("health", 1.0);
+ bb.append("state", (int)myState.s);
+ bb.append("stateStr", myState.toString());
+ bb.append("uptime", (unsigned)(time(0) - cmdLine.started));
+ if (!_self->config().arbiterOnly) {
+ bb.appendTimestamp("optime", lastOpTimeWritten.asDate());
+ bb.appendDate("optimeDate", lastOpTimeWritten.getSecs() * 1000LL);
+ }
+
+ int maintenance = _maintenanceMode;
+ if (maintenance) {
+ bb.append("maintenanceMode", maintenance);
+ }
+
+ if (theReplSet) {
+ string s = theReplSet->hbmsg();
+ if( !s.empty() )
+ bb.append("errmsg", s);
+ }
+ bb.append("self", true);
+ v.push_back(bb.obj());
+ }
+
+ Member *m =_members.head();
+ while( m ) {
+ BSONObjBuilder bb;
+ bb.append("_id", (int) m->id());
+ bb.append("name", m->fullName());
+ double h = m->hbinfo().health;
+ bb.append("health", h);
+ bb.append("state", (int) m->state().s);
+ if( h == 0 ) {
+ // if we can't connect the state info is from the past and could be confusing to show
+ bb.append("stateStr", "(not reachable/healthy)");
+ }
+ else {
+ bb.append("stateStr", m->state().toString());
+ }
+ bb.append("uptime", (unsigned) (m->hbinfo().upSince ? (time(0)-m->hbinfo().upSince) : 0));
+ if (!m->config().arbiterOnly) {
+ bb.appendTimestamp("optime", m->hbinfo().opTime.asDate());
+ bb.appendDate("optimeDate", m->hbinfo().opTime.getSecs() * 1000LL);
+ }
+ bb.appendTimeT("lastHeartbeat", m->hbinfo().lastHeartbeat);
+ bb.append("pingMs", m->hbinfo().ping);
+ string s = m->lhb();
+ if( !s.empty() )
+ bb.append("errmsg", s);
+
+ if (m->hbinfo().authIssue) {
+ bb.append("authenticated", false);
+ }
+
+ v.push_back(bb.obj());
+ m = m->next();
+ }
+ sort(v.begin(), v.end());
+ b.append("set", name());
+ b.appendTimeT("date", time(0));
+ b.append("myState", myState.s);
+ const Member *syncTarget = _currentSyncTarget;
+ if (syncTarget && myState != MemberState::RS_PRIMARY) {
+ b.append("syncingTo", syncTarget->fullName());
+ }
+ b.append("members", v);
+ if( replSetBlind )
+ b.append("blind",true); // to avoid confusion if set...normally never set except for testing.
+ }
+
+ static struct Test : public UnitTest {
+ void run() {
+ HealthOptions a,b;
+ assert( a == b );
+ assert( a.isDefault() );
+ }
+ } test;
+
+}
diff --git a/src/mongo/db/repl/health.h b/src/mongo/db/repl/health.h
new file mode 100644
index 00000000000..55cca93a27e
--- /dev/null
+++ b/src/mongo/db/repl/health.h
@@ -0,0 +1,50 @@
+// replset.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ /* throws */
+ bool requestHeartbeat(string setname, string fromHost, string memberFullName, BSONObj& result, int myConfigVersion, int& theirConfigVersion, bool checkEmpty = false);
+
+ struct HealthOptions {
+ HealthOptions() :
+ heartbeatSleepMillis(2000),
+ heartbeatTimeoutMillis( 10000 ),
+ heartbeatConnRetries(2)
+ { }
+
+ bool isDefault() const { return *this == HealthOptions(); }
+
+ // see http://www.mongodb.org/display/DOCS/Replica+Set+Internals
+ unsigned heartbeatSleepMillis;
+ unsigned heartbeatTimeoutMillis;
+        unsigned heartbeatConnRetries;
+
+ void check() {
+ uassert(13112, "bad replset heartbeat option", heartbeatSleepMillis >= 10);
+ uassert(13113, "bad replset heartbeat option", heartbeatTimeoutMillis >= 10);
+ }
+
+ bool operator==(const HealthOptions& r) const {
+ return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==r.heartbeatConnRetries;
+ }
+ };
+
+}
diff --git a/src/mongo/db/repl/heartbeat.cpp b/src/mongo/db/repl/heartbeat.cpp
new file mode 100644
index 00000000000..331812af85a
--- /dev/null
+++ b/src/mongo/db/repl/heartbeat.cpp
@@ -0,0 +1,382 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "health.h"
+#include "../../util/background.h"
+#include "../../client/dbclient.h"
+#include "../commands.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/task.h"
+#include "../../util/concurrency/msg.h"
+#include "../../util/mongoutils/html.h"
+#include "../../util/goodies.h"
+#include "../../util/ramlog.h"
+#include "../helpers/dblogger.h"
+#include "connections.h"
+#include "../../util/unittest.h"
+#include "../instance.h"
+#include "../repl.h"
+
+namespace mongo {
+
+ using namespace bson;
+
+ extern bool replSetBlind;
+ extern ReplSettings replSettings;
+
+ unsigned int HeartbeatInfo::numPings;
+
+ long long HeartbeatInfo::timeDown() const {
+ if( up() ) return 0;
+ if( downSince == 0 )
+ return 0; // still waiting on first heartbeat
+ return jsTime() - downSince;
+ }
+
+ /* { replSetHeartbeat : <setname> } */
+ class CmdReplSetHeartbeat : public ReplSetCommand {
+ public:
+ CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( replSetBlind ) {
+ if (theReplSet) {
+ errmsg = str::stream() << theReplSet->selfFullName() << " is blind";
+ }
+ return false;
+ }
+
+ /* we don't call ReplSetCommand::check() here because heartbeat
+ checks many things that are pre-initialization. */
+ if( !replSet ) {
+ errmsg = "not running with --replSet";
+ return false;
+ }
+
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
+ /* we want to keep heartbeat connections open when relinquishing primary. tag them here. */
+ {
+ AbstractMessagingPort *mp = cc().port();
+ if( mp )
+ mp->tag |= 1;
+ }
+
+ if( cmdObj["pv"].Int() != 1 ) {
+ errmsg = "incompatible replset protocol version";
+ return false;
+ }
+ {
+ string s = string(cmdObj.getStringField("replSetHeartbeat"));
+ if( cmdLine.ourSetName() != s ) {
+ errmsg = "repl set names do not match";
+ log() << "replSet set names do not match, our cmdline: " << cmdLine._replSet << rsLog;
+ log() << "replSet s: " << s << rsLog;
+ result.append("mismatch", true);
+ return false;
+ }
+ }
+
+ result.append("rs", true);
+ if( cmdObj["checkEmpty"].trueValue() ) {
+ result.append("hasData", replHasDatabases());
+ }
+ if( theReplSet == 0 ) {
+ string from( cmdObj.getStringField("from") );
+ if( !from.empty() ) {
+ scoped_lock lck( replSettings.discoveredSeeds_mx );
+ replSettings.discoveredSeeds.insert(from);
+ }
+ result.append("hbmsg", "still initializing");
+ return true;
+ }
+
+ if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
+ errmsg = "repl set names do not match (2)";
+ result.append("mismatch", true);
+ return false;
+ }
+ result.append("set", theReplSet->name());
+ result.append("state", theReplSet->state().s);
+ result.append("e", theReplSet->iAmElectable());
+ result.append("hbmsg", theReplSet->hbmsg());
+ result.append("time", (long long) time(0));
+ result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
+ int v = theReplSet->config().version;
+ result.append("v", v);
+ if( v > cmdObj["v"].Int() )
+ result << "config" << theReplSet->config().asBson();
+
+ return true;
+ }
+ } cmdReplSetHeartbeat;
+
+ bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result,
+ int myCfgVersion, int& theirCfgVersion, bool checkEmpty) {
+ if( replSetBlind ) {
+ return false;
+ }
+
+ BSONObj cmd = BSON( "replSetHeartbeat" << setName <<
+ "v" << myCfgVersion <<
+ "pv" << 1 <<
+ "checkEmpty" << checkEmpty <<
+ "from" << from );
+
+ // generally not a great idea to do outbound waiting calls in a
+ // write lock. heartbeats can be slow (multisecond to respond), so
+ // generally we don't want to be locked, at least not without
+        // thinking carefully about it first.
+ uassert(15900, "can't heartbeat: too much lock",
+ !d.dbMutex.isWriteLocked() || theReplSet == 0 || !theReplSet->lockedByMe() );
+
+ ScopedConn conn(memberFullName);
+ return conn.runCommand("admin", cmd, result, 0);
+ }
+
+ /**
+ * Poll every other set member to check its status.
+ *
+ * A detail about local machines and authentication: suppose we have 2
+ * members, A and B, on the same machine using different keyFiles. A is
+ * primary. If we're just starting the set, there are no admin users, so A
+ * and B can access each other because it's local access.
+ *
+ * Then we add a user to A. B cannot sync this user from A, because as soon
+ * as we add a an admin user, A requires auth. However, A can still
+ * heartbeat B, because B *doesn't* have an admin user. So A can reach B
+ * but B cannot reach A.
+ *
+ * Once B is restarted with the correct keyFile, everything should work as
+ * expected.
+ */
+ class ReplSetHealthPollTask : public task::Task {
+ private:
+ HostAndPort h;
+ HeartbeatInfo m;
+ int tries;
+ const int threshold;
+ public:
+ ReplSetHealthPollTask(const HostAndPort& hh, const HeartbeatInfo& mm)
+ : h(hh), m(mm), tries(0), threshold(15) { }
+
+ string name() const { return "rsHealthPoll"; }
+ void doWork() {
+ if ( !theReplSet ) {
+ LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog;
+ return;
+ }
+
+ HeartbeatInfo mem = m;
+ HeartbeatInfo old = mem;
+ try {
+ BSONObj info;
+ int theirConfigVersion = -10000;
+
+ bool ok = _requestHeartbeat(mem, info, theirConfigVersion);
+
+ // weight new ping with old pings
+ // on the first ping, just use the ping value
+ if (old.ping != 0) {
+ mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2));
+ }
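+                // e.g. a smoothed ping of 100ms and a raw sample of 200ms yield
+                // 0.8*100 + 0.2*200 = 120ms -- an exponentially weighted moving
+                // average that damps one-off spikes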
+
+ if( ok ) {
+ up(info, mem);
+ }
+ else if (!info["errmsg"].eoo() &&
+ info["errmsg"].str() == "need to login") {
+ authIssue(mem);
+ }
+ else {
+ down(mem, info.getStringField("errmsg"));
+ }
+ }
+ catch(DBException& e) {
+ down(mem, e.what());
+ }
+ catch(...) {
+ down(mem, "replSet unexpected exception in ReplSetHealthPollTask");
+ }
+ m = mem;
+
+ theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );
+
+ static time_t last = 0;
+ time_t now = time(0);
+ bool changed = mem.changed(old);
+ if( changed ) {
+ if( old.hbstate != mem.hbstate )
+ log() << "replSet member " << h.toString() << " is now in state " << mem.hbstate.toString() << rsLog;
+ }
+ if( changed || now-last>4 ) {
+ last = now;
+ theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
+ }
+
+ private:
+ bool _requestHeartbeat(HeartbeatInfo& mem, BSONObj& info, int& theirConfigVersion) {
+ if (tries++ % threshold == (threshold - 1)) {
+ ScopedConn conn(h.toString());
+ conn.reconnect();
+ }
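+            // with threshold == 15, every 15th attempt rebuilds the cached
+            // connection to this host (a periodic refresh in case the cached
+            // socket has silently gone bad)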
+
+ Timer timer;
+ time_t before = curTimeMicros64() / 1000000;
+
+ bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(),
+ h.toString(), info, theReplSet->config().version, theirConfigVersion);
+
+ mem.ping = (unsigned int)timer.millis();
+
+            // we set this on any response - we don't get this far if we
+            // couldn't connect, because an exception is thrown in that case
+ time_t after = mem.lastHeartbeat = before + (mem.ping / 1000);
+
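+            // the remote's clock reading should land inside our local
+            // [before, after] send/receive window; any excursion outside it is
+            // recorded as skew. e.g. before=1000s, after=1002s, remote t=1005s
+            // gives skew = +3s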
+ if ( info["time"].isNumber() ) {
+ long long t = info["time"].numberLong();
+ if( t > after )
+ mem.skew = (int) (t - after);
+ else if( t < before )
+ mem.skew = (int) (t - before); // negative
+ }
+ else {
+ // it won't be there if remote hasn't initialized yet
+ if( info.hasElement("time") )
+ warning() << "heatbeat.time isn't a number: " << info << endl;
+ mem.skew = INT_MIN;
+ }
+
+ {
+ be state = info["state"];
+ if( state.ok() )
+ mem.hbstate = MemberState(state.Int());
+ }
+
+ return ok;
+ }
+
+ void authIssue(HeartbeatInfo& mem) {
+ mem.authIssue = true;
+ mem.hbstate = MemberState::RS_UNKNOWN;
+
+ // set health to 0 so that this doesn't count towards majority
+ mem.health = 0.0;
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ void down(HeartbeatInfo& mem, string msg) {
+ mem.authIssue = false;
+ mem.health = 0.0;
+ mem.ping = 0;
+ if( mem.upSince || mem.downSince == 0 ) {
+ mem.upSince = 0;
+ mem.downSince = jsTime();
+ mem.hbstate = MemberState::RS_DOWN;
+ log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
+ }
+ mem.lastHeartbeatMsg = msg;
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ void up(const BSONObj& info, HeartbeatInfo& mem) {
+ HeartbeatInfo::numPings++;
+ mem.authIssue = false;
+
+ if( mem.upSince == 0 ) {
+ log() << "replSet member " << h.toString() << " is up" << rsLog;
+ mem.upSince = mem.lastHeartbeat;
+ }
+ mem.health = 1.0;
+ mem.lastHeartbeatMsg = info["hbmsg"].String();
+ if( info.hasElement("opTime") )
+ mem.opTime = info["opTime"].Date();
+
+ // see if this member is in the electable set
+ if( info["e"].eoo() ) {
+ // for backwards compatibility
+ const Member *member = theReplSet->findById(mem.id());
+ if (member && member->config().potentiallyHot()) {
+ theReplSet->addToElectable(mem.id());
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+ }
+ // add this server to the electable set if it is within 10
+ // seconds of the latest optime we know of
+ else if( info["e"].trueValue() &&
+ mem.opTime >= theReplSet->lastOpTimeWritten.getSecs() - 10) {
+ unsigned lastOp = theReplSet->lastOtherOpTime().getSecs();
+ if (lastOp > 0 && mem.opTime >= lastOp - 10) {
+ theReplSet->addToElectable(mem.id());
+ }
+ }
+ else {
+ theReplSet->rmFromElectable(mem.id());
+ }
+
+ be cfg = info["config"];
+ if( cfg.ok() ) {
+ // received a new config
+ boost::function<void()> f =
+ boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
+ theReplSet->mgr->send(f);
+ }
+ }
+ };
+
+ void ReplSetImpl::endOldHealthTasks() {
+ unsigned sz = healthTasks.size();
+ for( set<ReplSetHealthPollTask*>::iterator i = healthTasks.begin(); i != healthTasks.end(); i++ )
+ (*i)->halt();
+ healthTasks.clear();
+ if( sz )
+ DEV log() << "replSet debug: cleared old tasks " << sz << endl;
+ }
+
+ void ReplSetImpl::startHealthTaskFor(Member *m) {
+ ReplSetHealthPollTask *task = new ReplSetHealthPollTask(m->h(), m->hbinfo());
+ healthTasks.insert(task);
+ task::repeat(task, 2000);
+ }
+
+ void startSyncThread();
+
+ /** called during repl set startup. caller expects it to return fairly quickly.
+ note ReplSet object is only created once we get a config - so this won't run
+ until the initiation.
+ */
+ void ReplSetImpl::startThreads() {
+ task::fork(mgr);
+ mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+
+ boost::thread t(startSyncThread);
+
+ task::fork(ghost);
+
+ // member heartbeats are started in ReplSetImpl::initFromConfig
+ }
+
+}
+
+/* todo:
+ stop bg job and delete on removefromset
+*/
diff --git a/src/mongo/db/repl/manager.cpp b/src/mongo/db/repl/manager.cpp
new file mode 100644
index 00000000000..91648a1b506
--- /dev/null
+++ b/src/mongo/db/repl/manager.cpp
@@ -0,0 +1,274 @@
+/* @file manager.cpp
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "connections.h"
+#include "../client.h"
+
+namespace mongo {
+
+ enum {
+ NOPRIMARY = -2,
+ SELFPRIMARY = -1
+ };
+
+ /* check members OTHER THAN US to see if they think they are primary */
+ const Member * Manager::findOtherPrimary(bool& two) {
+ two = false;
+ Member *m = rs->head();
+ Member *p = 0;
+ while( m ) {
+ DEV assert( m != rs->_self );
+ if( m->state().primary() && m->hbinfo().up() ) {
+ if( p ) {
+ two = true;
+ return 0;
+ }
+ p = m;
+ }
+ m = m->next();
+ }
+ if( p )
+ noteARemoteIsPrimary(p);
+ return p;
+ }
+
+ Manager::Manager(ReplSetImpl *_rs) :
+ task::Server("rsMgr"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) {
+ }
+
+ Manager::~Manager() {
+        /* we don't destroy the replset object we sit in; however, the replset's constructor could have thrown during init.
+ the log message below is just a reminder to come back one day and review this code more, and to
+ make it cleaner.
+ */
+ log() << "info: ~Manager called" << rsLog;
+ rs->mgr = 0;
+ }
+
+ void Manager::starting() {
+ Client::initThread("rsMgr");
+ replLocalAuth();
+ }
+
+ void Manager::noteARemoteIsPrimary(const Member *m) {
+ if( rs->box.getPrimary() == m )
+ return;
+ rs->_self->lhb() = "";
+ if( rs->iAmArbiterOnly() ) {
+ rs->box.set(MemberState::RS_ARBITER, m);
+ }
+ else {
+ rs->box.noteRemoteIsPrimary(m);
+ }
+ }
+
+ void Manager::checkElectableSet() {
+ unsigned otherOp = rs->lastOtherOpTime().getSecs();
+
+ // make sure the electable set is up-to-date
+ if (rs->elect.aMajoritySeemsToBeUp() &&
+ rs->iAmPotentiallyHot() &&
+ (otherOp == 0 || rs->lastOpTimeWritten.getSecs() >= otherOp - 10)) {
+ theReplSet->addToElectable(rs->selfId());
+ }
+ else {
+ theReplSet->rmFromElectable(rs->selfId());
+ }
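+        // i.e. we stay in the electable set only while a majority is visible and
+        // our last applied op is within 10 seconds of the newest op reported by
+        // any other up member (otherOp == 0 means no other member is up)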
+
+ // check if we should ask the primary (possibly ourselves) to step down
+ const Member *highestPriority = theReplSet->getMostElectable();
+ const Member *primary = rs->box.getPrimary();
+
+ if (primary && highestPriority &&
+ highestPriority->config().priority > primary->config().priority) {
+ log() << "stepping down " << primary->fullName() << endl;
+
+ if (primary->h().isSelf()) {
+ // replSetStepDown tries to acquire the same lock
+ // msgCheckNewState takes, so we can't call replSetStepDown on
+ // ourselves.
+ rs->relinquish();
+ }
+ else {
+ BSONObj cmd = BSON( "replSetStepDown" << 1 );
+ ScopedConn conn(primary->fullName());
+ BSONObj result;
+ if (!conn.runCommand("admin", cmd, result, 0)) {
+ log() << "stepping down " << primary->fullName()
+ << " failed: " << result << endl;
+ }
+ }
+ }
+ }
+
+ void Manager::checkAuth() {
+ int down = 0, authIssue = 0, total = 0;
+
+ for( Member *m = rs->head(); m; m=m->next() ) {
+ total++;
+
+ // all authIssue servers will also be not up
+ if (!m->hbinfo().up()) {
+ down++;
+ if (m->hbinfo().authIssue) {
+ authIssue++;
+ }
+ }
+ }
+
+ // if all nodes are down or failed auth AND at least one failed
+ // auth, go into recovering. If all nodes are down, stay a
+ // secondary.
+ if (authIssue > 0 && down == total) {
+ log() << "replset error could not reach/authenticate against any members" << endl;
+
+ if (rs->box.getPrimary() == rs->_self) {
+ log() << "auth problems, relinquishing primary" << rsLog;
+ rs->relinquish();
+ }
+
+ rs->blockSync(true);
+ }
+ else {
+ rs->blockSync(false);
+ }
+ }
+
+ /** called as the health threads get new results */
+ void Manager::msgCheckNewState() {
+ {
+ theReplSet->assertValid();
+ rs->assertValid();
+
+ RSBase::lock lk(rs);
+
+ if( busyWithElectSelf ) return;
+
+ checkElectableSet();
+ checkAuth();
+
+ const Member *p = rs->box.getPrimary();
+ if( p && p != rs->_self ) {
+ if( !p->hbinfo().up() ||
+ !p->hbinfo().hbstate.primary() ) {
+ p = 0;
+ rs->box.setOtherPrimary(0);
+ }
+ }
+
+ const Member *p2;
+ {
+ bool two;
+ p2 = findOtherPrimary(two);
+ if( two ) {
+ /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
+ log() << "replSet info two primaries (transiently)" << rsLog;
+ return;
+ }
+ }
+
+ if( p2 ) {
+ /* someone else thinks they are primary. */
+ if( p == p2 ) {
+ // we thought the same; all set.
+ return;
+ }
+ if( p == 0 ) {
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ // todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ if( p != rs->_self ) {
+ // switch primary from oldremotep->newremotep2
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ /* we thought we were primary, yet now someone else thinks they are. */
+ if( !rs->elect.aMajoritySeemsToBeUp() ) {
+ /* we can't see a majority. so the other node is probably the right choice. */
+ noteARemoteIsPrimary(p2);
+ return;
+ }
+ /* ignore for now, keep thinking we are master.
+                       this could just be timing (we poll every couple of seconds), or it could
+                       indicate a problem; if it happens consistently for a duration of time we
+                       should alert the sysadmin.
+ */
+ return;
+ }
+
+ /* didn't find anyone who wants to be primary */
+
+ if( p ) {
+ /* we are already primary */
+
+ if( p != rs->_self ) {
+ rs->sethbmsg("error p != rs->self in checkNewState");
+ log() << "replSet " << p->fullName() << rsLog;
+ log() << "replSet " << rs->_self->fullName() << rsLog;
+ return;
+ }
+
+ if( rs->elect.shouldRelinquish() ) {
+ log() << "can't see a majority of the set, relinquishing primary" << rsLog;
+ rs->relinquish();
+ }
+
+ return;
+ }
+
+ if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary
+ OCCASIONALLY log() << "replSet I don't see a primary and I can't elect myself" << endl;
+ return;
+ }
+
+ /* no one seems to be primary. shall we try to elect ourself? */
+ if( !rs->elect.aMajoritySeemsToBeUp() ) {
+ static time_t last;
+ static int n;
+ int ll = 0;
+ if( ++n > 5 ) ll++;
+            if( last + 60 > time(0) ) ll++;
+ log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog;
+ last = time(0);
+ return;
+ }
+
+ if( !rs->iAmElectable() ) {
+ return;
+ }
+
+ busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
+ }
+ try {
+ rs->elect.electSelf();
+ }
+ catch(RetryAfterSleepException&) {
+        /* we want to process new inbounds before trying this again. so we just put a checkNewState in the queue for eval later. */
+ requeue();
+ }
+ catch(...) {
+ log() << "replSet error unexpected assertion in rs manager" << rsLog;
+ }
+ busyWithElectSelf = false;
+ }
+
+}
diff --git a/src/mongo/db/repl/multicmd.h b/src/mongo/db/repl/multicmd.h
new file mode 100644
index 00000000000..2d70c551f64
--- /dev/null
+++ b/src/mongo/db/repl/multicmd.h
@@ -0,0 +1,75 @@
+// @file multicmd.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/background.h"
+#include "connections.h"
+
+namespace mongo {
+
+ struct Target {
+ Target(string hostport) : toHost(hostport), ok(false) { }
+ //Target() : ok(false) { }
+ const string toHost;
+ bool ok;
+ BSONObj result;
+ };
+
+ /** send a command to several servers in parallel. waits for all to complete before
+ returning.
+
+ in: Target::toHost
+ out: Target::result and Target::ok
+ */
+ void multiCommand(BSONObj cmd, list<Target>& L);
+
+ class _MultiCommandJob : public BackgroundJob {
+ public:
+ BSONObj& cmd;
+ Target& d;
+ _MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) { }
+
+ private:
+ string name() const { return "MultiCommandJob"; }
+ void run() {
+ try {
+ ScopedConn c(d.toHost);
+ d.ok = c.runCommand("admin", cmd, d.result);
+ }
+ catch(DBException&) {
+ DEV log() << "dev caught dbexception on multiCommand " << d.toHost << rsLog;
+ }
+ }
+ };
+
+ inline void multiCommand(BSONObj cmd, list<Target>& L) {
+ list< shared_ptr<BackgroundJob> > jobs;
+
+ for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
+ Target& d = *i;
+ _MultiCommandJob *j = new _MultiCommandJob(cmd, d);
+ jobs.push_back( shared_ptr<BackgroundJob>(j) );
+ j->go();
+ }
+
+ for( list< shared_ptr<BackgroundJob> >::iterator i = jobs.begin(); i != jobs.end(); i++ ) {
+ (*i)->wait();
+ }
+ }
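+
+    /* usage sketch (hypothetical caller):
+
+           list<Target> targets;
+           targets.push_back(Target("alpha:27017"));
+           targets.push_back(Target("beta:27017"));
+           multiCommand(BSON( "replSetFresh" << 1 ), targets);
+           // each Target's ok/result fields are now filled in
+    */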
+}
diff --git a/src/mongo/db/repl/replset_commands.cpp b/src/mongo/db/repl/replset_commands.cpp
new file mode 100644
index 00000000000..84f16e53466
--- /dev/null
+++ b/src/mongo/db/repl/replset_commands.cpp
@@ -0,0 +1,404 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../commands.h"
+#include "../repl.h"
+#include "health.h"
+#include "rs.h"
+#include "rs_config.h"
+#include "../dbwebserver.h"
+#include "../../util/mongoutils/html.h"
+#include "../../client/dbclient.h"
+#include "../repl_block.h"
+
+using namespace bson;
+
+namespace mongo {
+
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial);
+
+ /* commands in other files:
+ replSetHeartbeat - health.cpp
+ replSetInitiate - rs_mod.cpp
+ */
+
+ bool replSetBlind = false;
+ unsigned replSetForceInitialSyncFailure = 0;
+
+ class CmdReplSetTest : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Just for regression tests.\n";
+ }
+ CmdReplSetTest() : ReplSetCommand("replSetTest") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog;
+
+ if (!checkAuth(errmsg, result)) {
+ return false;
+ }
+
+ if( cmdObj.hasElement("forceInitialSyncFailure") ) {
+ replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number();
+ return true;
+ }
+
+ if( !check(errmsg, result) )
+ return false;
+
+ if( cmdObj.hasElement("blind") ) {
+ replSetBlind = cmdObj.getBoolField("blind");
+ return true;
+ }
+
+ if (cmdObj.hasElement("sethbmsg")) {
+ replset::sethbmsg(cmdObj["sethbmsg"].String());
+ return true;
+ }
+
+ return false;
+ }
+ } cmdReplSetTest;
+
+ /** get rollback id. used to check if a rollback happened during some interval of time.
+        as seen by consumers, the rollback id is not in any particular order; it simply changes on each rollback.
+ @see incRBID()
+ */
+ class CmdReplSetGetRBID : public ReplSetCommand {
+ public:
+ /* todo: ideally this should only change on rollbacks NOT on mongod restarts also. fix... */
+ int rbid;
+ virtual void help( stringstream &help ) const {
+ help << "internal";
+ }
+ CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {
+            // millis since epoch is ok here, but micros or a combination with rand() and/or 64 bits might be better --
+            // imagine a restart and a clock correction happening simultaneously (very unlikely, but possible...)
+ rbid = (int) curTimeMillis64();
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ result.append("rbid",rbid);
+ return true;
+ }
+ } cmdReplSetRBID;
+
+ /** we increment the rollback id on every rollback event. */
+ void incRBID() {
+ cmdReplSetRBID.rbid++;
+ }
+
+ /** helper to get rollback id from another server. */
+ int getRBID(DBClientConnection *c) {
+ bo info;
+ c->simpleCommand("admin", &info, "replSetGetRBID");
+ return info["rbid"].numberInt();
+ }
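+
+    /* typical use (a sketch): snapshot a remote's rbid with getRBID() before some
+       interval of interest and compare afterwards -- any change means a rollback
+       happened on that server in between. */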
+
+ class CmdReplSetGetStatus : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Report status of a replica set from the POV of this server\n";
+ help << "{ replSetGetStatus : 1 }";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( cmdObj["forShell"].trueValue() )
+ lastError.disableForCommand();
+
+ if( !check(errmsg, result) )
+ return false;
+ theReplSet->summarizeStatus(result);
+ return true;
+ }
+ } cmdReplSetGetStatus;
+
+ class CmdReplSetReconfig : public ReplSetCommand {
+ RWLock mutex; /* we don't need rw but we wanted try capability. :-( */
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "Adjust configuration of a replica set\n";
+ help << "{ replSetReconfig : config_object }";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { }
+ virtual bool run(const string& a, BSONObj& b, int e, string& errmsg, BSONObjBuilder& c, bool d) {
+ try {
+ rwlock_try_write lk(mutex);
+ return _run(a,b,e,errmsg,c,d);
+ }
+ catch(rwlock_try_write::exception&) { }
+ errmsg = "a replSetReconfig is already in progress";
+ return false;
+ }
+ private:
+ bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if ( !checkAuth(errmsg, result) ) {
+ return false;
+ }
+
+ if( cmdObj["replSetReconfig"].type() != Object ) {
+ errmsg = "no configuration specified";
+ return false;
+ }
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+ if( force && !theReplSet ) {
+ replSettings.reconfig = cmdObj["replSetReconfig"].Obj().getOwned();
+ result.append("msg", "will try this config momentarily, try running rs.conf() again in a few seconds");
+ return true;
+ }
+
+ if ( !check(errmsg, result) ) {
+ return false;
+ }
+
+ if( !force && !theReplSet->box.getState().primary() ) {
+ errmsg = "replSetReconfig command must be sent to the current replica set primary.";
+ return false;
+ }
+
+ {
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // are up - we probably don't want a change to apply 30 minutes after the initial attempt.
+ time_t t = time(0);
+ writelock lk("");
+ if( time(0)-t > 20 ) {
+ errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?";
+ return false;
+ }
+ }
+
+ try {
+ ReplSetConfig newConfig(cmdObj["replSetReconfig"].Obj(), force);
+
+ log() << "replSet replSetReconfig config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
+
+ if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) {
+ return false;
+ }
+
+ checkMembersUpForConfigChange(newConfig, result, false);
+
+ log() << "replSet replSetReconfig [2]" << rsLog;
+
+ theReplSet->haveNewConfig(newConfig, true);
+ ReplSet::startupStatusMsg.set("replSetReconfig'd");
+ }
+ catch( DBException& e ) {
+ log() << "replSet replSetReconfig exception: " << e.what() << rsLog;
+ throw;
+ }
+ catch( string& se ) {
+ log() << "replSet reconfig exception: " << se << rsLog;
+ errmsg = se;
+ return false;
+ }
+
+ resetSlaveCache();
+ return true;
+ }
+ } cmdReplSetReconfig;
+
+ class CmdReplSetFreeze : public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetFreeze : <seconds> }";
+ help << "'freeze' state of member to the extent we can do that. What this really means is that\n";
+ help << "this node will not attempt to become primary until the time period specified expires.\n";
+ help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n";
+ help << "A process restart unfreezes the member also.\n";
+ help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+
+ CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( theReplSet->freeze(secs) ) {
+ if( secs == 0 )
+ result.append("info","unfreezing");
+ }
+ if( secs == 1 )
+ result.append("warning", "you really want to freeze for only 1 second?");
+ return true;
+ }
+ } cmdReplSetFreeze;
+
+ class CmdReplSetStepDown: public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetStepDown : <seconds> }\n";
+ help << "Step down as primary. Will not try to reelect self for the specified time period (1 minute if no numeric secs value specified).\n";
+ help << "(If another member with same priority takes over in the meantime, it will stay primary.)\n";
+ help << "http://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+
+ CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ if( !theReplSet->box.getState().primary() ) {
+ errmsg = "not primary so can't step down";
+ return false;
+ }
+
+ bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+ // only step down if there is another node synced to within 10
+ // seconds of this node
+ if (!force) {
+ long long int lastOp = (long long int)theReplSet->lastOpTimeWritten.getSecs();
+ long long int closest = (long long int)theReplSet->lastOtherOpTime().getSecs();
+
+ long long int diff = lastOp - closest;
+ result.append("closest", closest);
+ result.append("difference", diff);
+
+ if (diff < 0) {
+                // not our problem, but we'll wait until things settle down
+ errmsg = "someone is ahead of the primary?";
+ return false;
+ }
+
+ if (diff > 10) {
+ errmsg = "no secondaries within 10 seconds of my optime";
+ return false;
+ }
+ }
+
+ int secs = (int) cmdObj.firstElement().numberInt();
+ if( secs == 0 )
+ secs = 60;
+ return theReplSet->stepDown(secs);
+ }
+ } cmdReplSetStepDown;
+
+ class CmdReplSetMaintenance: public ReplSetCommand {
+ public:
+ virtual void help( stringstream &help ) const {
+ help << "{ replSetMaintenance : bool }\n";
+ help << "Enable or disable maintenance mode.";
+ }
+
+ CmdReplSetMaintenance() : ReplSetCommand("replSetMaintenance") { }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ if( !check(errmsg, result) )
+ return false;
+ if( theReplSet->box.getState().primary() ) {
+ errmsg = "primaries can't modify maintenance mode";
+ return false;
+ }
+
+ theReplSet->setMaintenanceMode(cmdObj["replSetMaintenance"].trueValue());
+ return true;
+ }
+ } cmdReplSetMaintenance;
+
+ using namespace bson;
+ using namespace mongoutils::html;
+ extern void fillRsLog(stringstream&);
+
+ class ReplSetHandler : public DbWebHandler {
+ public:
+ ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ) {}
+
+ virtual bool handles( const string& url ) const {
+ return startsWith( url , "/_replSet" );
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+
+ if( url == "/_replSetOplog" ) {
+ responseMsg = _replSetOplog(params);
+ }
+ else
+ responseMsg = _replSet();
+ responseCode = 200;
+ }
+
+ string _replSetOplog(bo parms) {
+ int _id = (int) str::toUnsigned( parms["_id"].String() );
+
+ stringstream s;
+ string t = "Replication oplog";
+ s << start(t);
+ s << p(t);
+
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
+ s << p("Not using --replSet");
+ else {
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ + ".<br>" + ReplSet::startupStatusMsg.get());
+ }
+ }
+ else {
+ try {
+ theReplSet->getOplogDiagsAsHtml(_id, s);
+ }
+ catch(std::exception& e) {
+ s << "error querying oplog: " << e.what() << '\n';
+ }
+ }
+
+ s << _end();
+ return s.str();
+ }
+
+ /* /_replSet show replica set status in html format */
+ string _replSet() {
+ stringstream s;
+ s << start("Replica Set Status " + prettyHostName());
+ s << p( a("/", "back", "Home") + " | " +
+ a("/local/system.replset/?html=1", "", "View Replset Config") + " | " +
+ a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " +
+ a("http://www.mongodb.org/display/DOCS/Replica+Sets", "", "Docs")
+ );
+
+ if( theReplSet == 0 ) {
+ if( cmdLine._replSet.empty() )
+ s << p("Not using --replSet");
+ else {
+ s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+ + ".<br>" + ReplSet::startupStatusMsg.get());
+ }
+ }
+ else {
+ try {
+ theReplSet->summarizeAsHtml(s);
+ }
+ catch(...) { s << "error summarizing replset status\n"; }
+ }
+ s << p("Recent replset log activity:");
+ fillRsLog(s);
+ s << _end();
+ return s.str();
+ }
+
+
+
+ } replSetHandler;
+
+}
diff --git a/src/mongo/db/repl/rs.cpp b/src/mongo/db/repl/rs.cpp
new file mode 100644
index 00000000000..fff5d72bcc0
--- /dev/null
+++ b/src/mongo/db/repl/rs.cpp
@@ -0,0 +1,778 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../../util/net/sock.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "../dbhelpers.h"
+#include "../../s/d_logic.h"
+#include "rs.h"
+#include "connections.h"
+#include "../repl.h"
+#include "../instance.h"
+
+using namespace std;
+
+namespace mongo {
+
+ using namespace bson;
+
+ bool replSet = false;
+ ReplSet *theReplSet = 0;
+
+ bool isCurrentlyAReplSetPrimary() {
+ return theReplSet && theReplSet->isPrimary();
+ }
+
+ void replset::sethbmsg(const string& s, const int level) {
+ if (theReplSet) {
+            theReplSet->sethbmsg(s, level);
+ }
+ }
+
+ void ReplSetImpl::sethbmsg(string s, int logLevel) {
+ static time_t lastLogged;
+ _hbmsgTime = time(0);
+
+ if( s == _hbmsg ) {
+ // unchanged
+ if( _hbmsgTime - lastLogged < 60 )
+ return;
+ }
+
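+        // _hbmsg is a fixed 256-byte buffer (zeroed in the constructor); longer
+        // messages are truncated to 255 chars, leaving the final byte as the
+        // never-overwritten terminator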
+ unsigned sz = s.size();
+ if( sz >= 256 )
+ memcpy(_hbmsg, s.c_str(), 255);
+ else {
+ _hbmsg[sz] = 0;
+ memcpy(_hbmsg, s.c_str(), sz);
+ }
+ if( !s.empty() ) {
+ lastLogged = _hbmsgTime;
+ log(logLevel) << "replSet " << s << rsLog;
+ }
+ }
+
+ void ReplSetImpl::assumePrimary() {
+ LOG(2) << "replSet assuming primary" << endl;
+ assert( iAmPotentiallyHot() );
+ writelock lk("admin."); // so we are synchronized with _logOp()
+
+ // Make sure that new OpTimes are higher than existing ones even with clock skew
+ DBDirectClient c;
+ BSONObj lastOp = c.findOne( "local.oplog.rs", Query().sort(reverseNaturalObj), NULL, QueryOption_SlaveOk );
+ if ( !lastOp.isEmpty() ) {
+ OpTime::setLast( lastOp[ "ts" ].date() );
+ }
+
+ changeState(MemberState::RS_PRIMARY);
+ }
+
+ void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); }
+
+ void ReplSetImpl::setMaintenanceMode(const bool inc) {
+ lock lk(this);
+
+ if (inc) {
+ log() << "replSet going into maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+
+ _maintenanceMode++;
+ changeState(MemberState::RS_RECOVERING);
+ }
+ else {
+ _maintenanceMode--;
+ // no need to change state, syncTail will try to go live as a secondary soon
+
+ log() << "leaving maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog;
+ }
+ }
+
+ Member* ReplSetImpl::getMostElectable() {
+ lock lk(this);
+
+ Member *max = 0;
+
+        for (set<unsigned>::iterator it = _electableSet.begin(); it != _electableSet.end(); ) {
+            const Member *temp = findById(*it);
+            if (!temp) {
+                log() << "couldn't find member: " << *it << endl;
+                // erase via a post-incremented iterator copy; erasing by value
+                // here would invalidate 'it' and make the next increment undefined
+                _electableSet.erase(it++);
+                continue;
+            }
+            if (!max || max->config().priority < temp->config().priority) {
+                max = (Member*)temp;
+            }
+            ++it;
+        }
+
+ return max;
+ }
+
+ const bool closeOnRelinquish = true;
+
+ void ReplSetImpl::relinquish() {
+ LOG(2) << "replSet attempting to relinquish" << endl;
+ if( box.getState().primary() ) {
+ {
+ writelock lk("admin."); // so we are synchronized with _logOp()
+
+ log() << "replSet relinquishing primary state" << rsLog;
+ changeState(MemberState::RS_SECONDARY);
+ }
+
+ if( closeOnRelinquish ) {
+            /* close sockets that were talking to us so they don't blithely send many writes that will fail
+               with "not master" (of course the client could check the result code, but many do not)
+ */
+ log() << "replSet closing client sockets after reqlinquishing primary" << rsLog;
+ MessagingPort::closeAllSockets(1);
+ }
+
+            // now that all connections were closed, strip this mongod from all sharding details;
+            // if and when it gets promoted to primary again, only then should it reload the sharding state.
+            // the rationale is that this mongod won't bring stale state along when it regains primaryhood
+ shardingState.resetShardingState();
+
+ }
+ else if( box.getState().startup2() ) {
+ // ? add comment
+ changeState(MemberState::RS_RECOVERING);
+ }
+ }
+
+ /* look freshly for who is primary - includes relinquishing ourself. */
+ void ReplSetImpl::forgetPrimary() {
+ if( box.getState().primary() )
+ relinquish();
+ else {
+ box.setOtherPrimary(0);
+ }
+ }
+
+ // for the replSetStepDown command
+ bool ReplSetImpl::_stepDown(int secs) {
+ lock lk(this);
+ if( box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info stepping down as primary secs=" << secs << rsLog;
+ relinquish();
+ return true;
+ }
+ return false;
+ }
+
+ bool ReplSetImpl::_freeze(int secs) {
+ lock lk(this);
+ /* note if we are primary we remain primary but won't try to elect ourself again until
+ this time period expires.
+ */
+ if( secs == 0 ) {
+ elect.steppedDown = 0;
+ log() << "replSet info 'unfreezing'" << rsLog;
+ }
+ else {
+ if( !box.getState().primary() ) {
+ elect.steppedDown = time(0) + secs;
+ log() << "replSet info 'freezing' for " << secs << " seconds" << rsLog;
+ }
+ else {
+ log() << "replSet info received freeze command but we are primary" << rsLog;
+ }
+ }
+ return true;
+ }
+
+ void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) {
+ for( Member *m = _members.head(); m; m=m->next() ) {
+ if( m->id() == h.id() ) {
+ m->_hbinfo = h;
+ return;
+ }
+ }
+ }
+
+ list<HostAndPort> ReplSetImpl::memberHostnames() const {
+ list<HostAndPort> L;
+ L.push_back(_self->h());
+ for( Member *m = _members.head(); m; m = m->next() )
+ L.push_back(m->h());
+ return L;
+ }
+
+ void ReplSetImpl::_fillIsMasterHost(const Member *m, vector<string>& hosts, vector<string>& passives, vector<string>& arbiters) {
+ assert( m );
+ if( m->config().hidden )
+ return;
+
+ if( m->potentiallyHot() ) {
+ hosts.push_back(m->h().toString());
+ }
+ else if( !m->config().arbiterOnly ) {
+ if( m->config().slaveDelay ) {
+ /* hmmm - we don't list these as they are stale. */
+ }
+ else {
+ passives.push_back(m->h().toString());
+ }
+ }
+ else {
+ arbiters.push_back(m->h().toString());
+ }
+ }
+
+ void ReplSetImpl::_fillIsMaster(BSONObjBuilder& b) {
+ lock lk(this);
+
+ const StateBox::SP sp = box.get();
+ bool isp = sp.state.primary();
+ b.append("setName", name());
+ b.append("ismaster", isp);
+ b.append("secondary", sp.state.secondary());
+ {
+ vector<string> hosts, passives, arbiters;
+ _fillIsMasterHost(_self, hosts, passives, arbiters);
+
+ for( Member *m = _members.head(); m; m = m->next() ) {
+ assert( m );
+ _fillIsMasterHost(m, hosts, passives, arbiters);
+ }
+
+ if( hosts.size() > 0 ) {
+ b.append("hosts", hosts);
+ }
+ if( passives.size() > 0 ) {
+ b.append("passives", passives);
+ }
+ if( arbiters.size() > 0 ) {
+ b.append("arbiters", arbiters);
+ }
+ }
+
+ if( !isp ) {
+ const Member *m = sp.primary;
+ if( m )
+ b.append("primary", m->h().toString());
+ }
+ else {
+ b.append("primary", _self->fullName());
+ }
+
+ if( myConfig().arbiterOnly )
+ b.append("arbiterOnly", true);
+ if( myConfig().priority == 0 && !myConfig().arbiterOnly)
+ b.append("passive", true);
+ if( myConfig().slaveDelay )
+ b.append("slaveDelay", myConfig().slaveDelay);
+ if( myConfig().hidden )
+ b.append("hidden", true);
+ if( !myConfig().buildIndexes )
+ b.append("buildIndexes", false);
+ if( !myConfig().tags.empty() ) {
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = myConfig().tags.begin(); i != myConfig().tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
+ }
+ b.append("me", myConfig().h.toString());
+ }
+
+ /** @param cfgString <setname>/<seedhost1>,<seedhost2> */
+
+ void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet ) {
+ const char *p = cfgString.c_str();
+ const char *slash = strchr(p, '/');
+ if( slash )
+ setname = string(p, slash-p);
+ else
+ setname = p;
+ uassert(13093, "bad --replSet config string format is: <setname>[/<seedhost1>,<seedhost2>,...]", !setname.empty());
+
+ if( slash == 0 )
+ return;
+
+ p = slash + 1;
+ while( 1 ) {
+ const char *comma = strchr(p, ',');
+ if( comma == 0 ) comma = strchr(p,0);
+ if( p == comma )
+ break;
+ {
+ HostAndPort m;
+ try {
+ m = HostAndPort( string(p, comma-p) );
+ }
+ catch(...) {
+ uassert(13114, "bad --replSet seed hostname", false);
+ }
+ uassert(13096, "bad --replSet command line config string - dups?", seedSet.count(m) == 0 );
+ seedSet.insert(m);
+ //uassert(13101, "can't use localhost in replset host list", !m.isLocalHost());
+ if( m.isSelf() ) {
+ log(1) << "replSet ignoring seed " << m.toString() << " (=self)" << rsLog;
+ }
+ else
+ seeds.push_back(m);
+ if( *comma == 0 )
+ break;
+ p = comma + 1;
+ }
+ }
+ }
+
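+    /* example: parseReplsetCmdLine("rs0/alpha:27017,beta:27017", ...) yields
+       setname == "rs0" and seeds == [ alpha:27017, beta:27017 ]; a seed that
+       resolves to ourselves is logged and skipped. */
+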
+ ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this),
+ _currentSyncTarget(0),
+ _blockSync(false),
+ _hbmsgTime(0),
+ _self(0),
+ _maintenanceMode(0),
+ mgr( new Manager(this) ),
+ ghost( new GhostSync(this) ) {
+
+ _cfg = 0;
+ memset(_hbmsg, 0, sizeof(_hbmsg));
+ strcpy( _hbmsg , "initial startup" );
+ lastH = 0;
+ changeState(MemberState::RS_STARTUP);
+
+ _seeds = &replSetCmdline.seeds;
+
+ LOG(1) << "replSet beginning startup..." << rsLog;
+
+ loadConfig();
+
+ unsigned sss = replSetCmdline.seedSet.size();
+ for( Member *m = head(); m; m = m->next() ) {
+ replSetCmdline.seedSet.erase(m->h());
+ }
+ for( set<HostAndPort>::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) {
+ if( i->isSelf() ) {
+ if( sss == 1 ) {
+ LOG(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog;
+ }
+ }
+ else {
+ log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog;
+ }
+ }
+ }
+
+ void newReplUp();
+
+ void ReplSetImpl::loadLastOpTimeWritten(bool quiet) {
+ readlock lk(rsoplog);
+ BSONObj o;
+ if( Helpers::getLast(rsoplog, o) ) {
+ lastH = o["h"].numberLong();
+ lastOpTimeWritten = o["ts"]._opTime();
+ uassert(13290, "bad replSet oplog entry?", quiet || !lastOpTimeWritten.isNull());
+ }
+ }
+
+ /* call after constructing to start - returns fairly quickly after launching its threads */
+ void ReplSetImpl::_go() {
+ try {
+ loadLastOpTimeWritten();
+ }
+ catch(std::exception& e) {
+ log() << "replSet error fatal couldn't query the local " << rsoplog << " collection. Terminating mongod after 30 seconds." << rsLog;
+ log() << e.what() << rsLog;
+ sleepsecs(30);
+ dbexit( EXIT_REPLICATION_ERROR );
+ return;
+ }
+
+ changeState(MemberState::RS_STARTUP2);
+ startThreads();
+ newReplUp(); // oplog.cpp
+ }
+
+ ReplSetImpl::StartupStatus ReplSetImpl::startupStatus = PRESTART;
+ DiagStr ReplSetImpl::startupStatusMsg;
+
+ extern BSONObj *getLastErrorDefault;
+
+ void ReplSetImpl::setSelfTo(Member *m) {
+ // already locked in initFromConfig
+ _self = m;
+ _id = m->id();
+ _config = m->config();
+        _buildIndexes = m->config().buildIndexes;
+ }
+
+ /** @param reconf true if this is a reconfiguration and not an initial load of the configuration.
+ @return true if ok; throws if config really bad; false if config doesn't include self
+ */
+ bool ReplSetImpl::initFromConfig(ReplSetConfig& c, bool reconf) {
+ /* NOTE: haveNewConfig() writes the new config to disk before we get here. So
+ we cannot error out at this point, except fatally. Check errors earlier.
+ */
+ lock lk(this);
+
+ if( getLastErrorDefault || !c.getLastErrorDefaults.isEmpty() ) {
+ // see comment in dbcommands.cpp for getlasterrordefault
+ getLastErrorDefault = new BSONObj( c.getLastErrorDefaults );
+ }
+
+ list<ReplSetConfig::MemberCfg*> newOnes;
+ // additive short-cuts the new config setup. If we are just adding a
+ // node/nodes and nothing else is changing, this is additive. If it's
+ // not a reconfig, we're not adding anything
+ bool additive = reconf;
+ {
+ unsigned nfound = 0;
+ int me = 0;
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) {
+
+ ReplSetConfig::MemberCfg& m = *i;
+ if( m.h.isSelf() ) {
+ me++;
+ }
+
+ if( reconf ) {
+ if (m.h.isSelf() && (!_self || (int)_self->id() != m._id)) {
+ log() << "self doesn't match: " << m._id << rsLog;
+ assert(false);
+ }
+
+ const Member *old = findById(m._id);
+ if( old ) {
+ nfound++;
+ assert( (int) old->id() == m._id );
+ if( old->config() != m ) {
+ additive = false;
+ }
+ }
+ else {
+ newOnes.push_back(&m);
+ }
+ }
+ }
+ if( me == 0 ) {
+ _members.orphanAll();
+
+ // sending hbs must continue to pick up new config, so we leave
+ // hb threads alone
+
+ // close sockets to force clients to re-evaluate this member
+ MessagingPort::closeAllSockets(0);
+
+ // stop sync thread
+ box.set(MemberState::RS_STARTUP, 0);
+
+ // go into holding pattern
+ log() << "replSet error self not present in the repl set configuration:" << rsLog;
+ log() << c.toString() << rsLog;
+ return false;
+ }
+ uassert( 13302, "replSet error self appears twice in the repl set configuration", me<=1 );
+
+        // if we found different members than the original config, reload everything
+ if( reconf && config().members.size() != nfound )
+ additive = false;
+ }
+
+ _cfg = new ReplSetConfig(c);
+ assert( _cfg->ok() );
+ assert( _name.empty() || _name == _cfg->_id );
+ _name = _cfg->_id;
+ assert( !_name.empty() );
+
+ // this is a shortcut for simple changes
+ if( additive ) {
+ log() << "replSet info : additive change to configuration" << rsLog;
+ for( list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) {
+ ReplSetConfig::MemberCfg *m = *i;
+ Member *mi = new Member(m->h, m->_id, m, false);
+
+                /** we will indicate that new members are up() initially so that we don't relinquish our
+                    primary state because we (transiently) can't see a majority. they should in fact be up,
+                    as we check that new members are reachable before getting here on a reconfig anyway.
+ */
+ mi->get_hbinfo().health = 0.1;
+
+ _members.push(mi);
+ startHealthTaskFor(mi);
+ }
+
+ // if we aren't creating new members, we may have to update the
+ // groups for the current ones
+ _cfg->updateMembers(_members);
+
+ return true;
+ }
+
+ // start with no members. if this is a reconfig, drop the old ones.
+ _members.orphanAll();
+
+ endOldHealthTasks();
+
+ int oldPrimaryId = -1;
+ {
+ const Member *p = box.getPrimary();
+ if( p )
+ oldPrimaryId = p->id();
+ }
+ forgetPrimary();
+
+ // not setting _self to 0 as other threads use _self w/o locking
+ int me = 0;
+
+ // For logging
+ string members = "";
+
+ for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) {
+ ReplSetConfig::MemberCfg& m = *i;
+ Member *mi;
+ members += ( members == "" ? "" : ", " ) + m.h.toString();
+ if( m.h.isSelf() ) {
+ assert( me++ == 0 );
+ mi = new Member(m.h, m._id, &m, true);
+ if (!reconf) {
+ log() << "replSet I am " << m.h.toString() << rsLog;
+ }
+ setSelfTo(mi);
+
+ if( (int)mi->id() == oldPrimaryId )
+ box.setSelfPrimary(mi);
+ }
+ else {
+ mi = new Member(m.h, m._id, &m, false);
+ _members.push(mi);
+ startHealthTaskFor(mi);
+ if( (int)mi->id() == oldPrimaryId )
+ box.setOtherPrimary(mi);
+ }
+ }
+
+ if( me == 0 ){
+ log() << "replSet warning did not detect own host in full reconfig, members " << members << " config: " << c << rsLog;
+ }
+
+ return true;
+ }
+
+ // Our own config must be the first one.
+ bool ReplSetImpl::_loadConfigFinish(vector<ReplSetConfig>& cfgs) {
+ int v = -1;
+ ReplSetConfig *highest = 0;
+ int myVersion = -2000;
+ int n = 0;
+ for( vector<ReplSetConfig>::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) {
+ ReplSetConfig& cfg = *i;
+ if( ++n == 1 ) myVersion = cfg.version;
+ if( cfg.ok() && cfg.version > v ) {
+ highest = &cfg;
+ v = cfg.version;
+ }
+ }
+ assert( highest );
+
+ if( !initFromConfig(*highest) )
+ return false;
+
+ if( highest->version > myVersion && highest->version >= 0 ) {
+ log() << "replSet got config version " << highest->version << " from a remote, saving locally" << rsLog;
+ highest->saveConfigLocally(BSONObj());
+ }
+ return true;
+ }
+
+ void ReplSetImpl::loadConfig() {
+ while( 1 ) {
+ startupStatus = LOADINGCONFIG;
+ startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
+ LOG(1) << "loadConfig() " << rsConfigNs << endl;
+ try {
+ vector<ReplSetConfig> configs;
+ try {
+ configs.push_back( ReplSetConfig(HostAndPort::me()) );
+ }
+ catch(DBException& e) {
+ log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog;
+ }
+ for( vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++ ) {
+ try {
+ configs.push_back( ReplSetConfig(*i) );
+ }
+ catch( DBException& e ) {
+ log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog;
+ }
+ }
+ {
+ scoped_lock lck( replSettings.discoveredSeeds_mx );
+ if( replSettings.discoveredSeeds.size() > 0 ) {
+ for (set<string>::iterator i = replSettings.discoveredSeeds.begin();
+ i != replSettings.discoveredSeeds.end();
+ i++) {
+ try {
+ configs.push_back( ReplSetConfig(HostAndPort(*i)) );
+ }
+ catch( DBException& ) {
+ log(1) << "replSet exception trying to load config from discovered seed " << *i << rsLog;
+ replSettings.discoveredSeeds.erase(*i);
+ }
+ }
+ }
+ }
+
+ if (!replSettings.reconfig.isEmpty()) {
+ try {
+ configs.push_back(ReplSetConfig(replSettings.reconfig, true));
+ }
+ catch( DBException& re) {
+ log() << "replSet couldn't load reconfig: " << re.what() << rsLog;
+ replSettings.reconfig = BSONObj();
+ }
+ }
+
+ int nok = 0;
+ int nempty = 0;
+ for( vector<ReplSetConfig>::iterator i = configs.begin(); i != configs.end(); i++ ) {
+ if( i->ok() )
+ nok++;
+ if( i->empty() )
+ nempty++;
+ }
+ if( nok == 0 ) {
+
+ if( nempty == (int) configs.size() ) {
+ startupStatus = EMPTYCONFIG;
+ startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)");
+ log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog;
+ static unsigned once;
+ if( ++once == 1 ) {
+ log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog;
+ }
+ if( _seeds->size() == 0 ) {
+ LOG(1) << "replSet info no seed hosts were specified on the --replSet command line" << rsLog;
+ }
+ }
+ else {
+ startupStatus = EMPTYUNREACHABLE;
+ startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)");
+ log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog;
+ }
+
+ sleepsecs(10);
+ continue;
+ }
+
+ if( !_loadConfigFinish(configs) ) {
+ log() << "replSet info Couldn't load config yet. Sleeping 20sec and will try again." << rsLog;
+ sleepsecs(20);
+ continue;
+ }
+ }
+ catch(DBException& e) {
+ startupStatus = BADCONFIG;
+ startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
+ log() << "replSet error loading configurations " << e.toString() << rsLog;
+ log() << "replSet error replication will not start" << rsLog;
+ sethbmsg("error loading set config");
+ _fatal();
+ throw;
+ }
+ break;
+ }
+ startupStatusMsg.set("? started");
+ startupStatus = STARTED;
+ }
+
+ void ReplSetImpl::_fatal() {
+ box.set(MemberState::RS_FATAL, 0);
+ log() << "replSet error fatal, stopping replication" << rsLog;
+ }
+
+ void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) {
+ bo comment;
+ if( addComment )
+ comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version );
+
+ newConfig.saveConfigLocally(comment);
+
+ try {
+ if (initFromConfig(newConfig, true)) {
+ log() << "replSet replSetReconfig new config saved locally" << rsLog;
+ }
+ }
+ catch(DBException& e) {
+ if( e.getCode() == 13497 /* removed from set */ ) {
+ cc().shutdown();
+ dbexit( EXIT_CLEAN , "removed from replica set" ); // never returns
+ assert(0);
+ }
+ log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog;
+ _fatal();
+ }
+ catch(...) {
+ log() << "replSet error unexpected exception in haveNewConfig()" << rsLog;
+ _fatal();
+ }
+ }
+
+ void Manager::msgReceivedNewConfig(BSONObj o) {
+ log() << "replset msgReceivedNewConfig version: " << o["version"].toString() << rsLog;
+ ReplSetConfig c(o);
+ if( c.version > rs->config().version )
+ theReplSet->haveNewConfig(c, false);
+ else {
+ log() << "replSet info msgReceivedNewConfig but version isn't higher " <<
+ c.version << ' ' << rs->config().version << rsLog;
+ }
+ }
+
+    /* forked as a thread during startup.
+ it can run quite a while looking for config. but once found,
+ a separate thread takes over as ReplSetImpl::Manager, and this thread
+ terminates.
+ */
+ void startReplSets(ReplSetCmdline *replSetCmdline) {
+ Client::initThread("rsStart");
+ try {
+ assert( theReplSet == 0 );
+ if( replSetCmdline == 0 ) {
+ assert(!replSet);
+ return;
+ }
+ replLocalAuth();
+ (theReplSet = new ReplSet(*replSetCmdline))->go();
+ }
+ catch(std::exception& e) {
+ log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog;
+ if( theReplSet )
+ theReplSet->fatal();
+ }
+ cc().shutdown();
+ }
+
+ void replLocalAuth() {
+ if ( noauth )
+ return;
+ cc().getAuthenticationInfo()->authorize("local","_repl");
+ }
+
+
+}
+
+namespace boost {
+
+ void assertion_failed(char const * expr, char const * function, char const * file, long line) {
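+        // note: we only log boost assertion failures here; we do not abort the process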
+ mongo::log() << "boost assertion failure " << expr << ' ' << function << ' ' << file << ' ' << line << endl;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs.h b/src/mongo/db/repl/rs.h
new file mode 100644
index 00000000000..8e43204be3b
--- /dev/null
+++ b/src/mongo/db/repl/rs.h
@@ -0,0 +1,667 @@
+// src/mongo/db/repl/rs.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/concurrency/list.h"
+#include "../../util/concurrency/value.h"
+#include "../../util/concurrency/msg.h"
+#include "../../util/net/hostandport.h"
+#include "../commands.h"
+#include "../oplog.h"
+#include "../oplogreader.h"
+#include "rs_exception.h"
+#include "rs_optime.h"
+#include "rs_member.h"
+#include "rs_config.h"
+
+/**
+ * Order of Events
+ *
+ * On startup, if the --replSet option is present, startReplSets is called.
+ * startReplSets forks off a new thread for replica set activities. It creates
+ * the global theReplSet variable and calls go() on it.
+ *
+ * theReplSet's constructor changes the replica set's state to RS_STARTUP,
+ * starts the replica set manager, and loads the config (if the replica set
+ * has been initialized).
+ */
+
+namespace mongo {
+
+ struct HowToFixUp;
+ struct Target;
+ class DBClientConnection;
+ class ReplSetImpl;
+ class OplogReader;
+ extern bool replSet; // true if using repl sets
+ extern class ReplSet *theReplSet; // null until initialized
+ extern Tee *rsLog;
+
+ /* member of a replica set */
+ class Member : public List1<Member>::Base {
+ private:
+        ~Member(); // intentionally unimplemented, as it should never be called -- see List1<>::Base.
+ Member(const Member&);
+ public:
+ Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self);
+
+ string fullName() const { return h().toString(); }
+ const ReplSetConfig::MemberCfg& config() const { return _config; }
+ ReplSetConfig::MemberCfg& configw() { return _config; }
+ const HeartbeatInfo& hbinfo() const { return _hbinfo; }
+ HeartbeatInfo& get_hbinfo() { return _hbinfo; }
+ string lhb() const { return _hbinfo.lastHeartbeatMsg; }
+ MemberState state() const { return _hbinfo.hbstate; }
+ const HostAndPort& h() const { return _h; }
+ unsigned id() const { return _hbinfo.id(); }
+
+ bool potentiallyHot() const { return _config.potentiallyHot(); } // not arbiter, not priority 0
+ void summarizeMember(stringstream& s) const;
+
+ private:
+ friend class ReplSetImpl;
+ ReplSetConfig::MemberCfg _config;
+ const HostAndPort _h;
+ HeartbeatInfo _hbinfo;
+ };
+
+ namespace replset {
+ /**
+ * "Normal" replica set syncing
+ */
+ class SyncTail : public Sync {
+ public:
+ virtual ~SyncTail() {}
+ SyncTail(const string& host) : Sync(host) {}
+ virtual bool syncApply(const BSONObj &o);
+ };
+
+ /**
+ * Initial clone and sync
+ */
+ class InitialSync : public SyncTail {
+ public:
+ InitialSync(const string& host) : SyncTail(host) {}
+ virtual ~InitialSync() {}
+ bool oplogApplication(OplogReader& r, const Member* source, const OpTime& applyGTE, const OpTime& minValid);
+ virtual void applyOp(const BSONObj& o, const OpTime& minvalid);
+ };
+
+ // TODO: move hbmsg into an error-keeping class (SERVER-4444)
+ void sethbmsg(const string& s, const int logLevel=0);
+
+ } // namespace replset
+
+ class Manager : public task::Server {
+ ReplSetImpl *rs;
+ bool busyWithElectSelf;
+ int _primary;
+
+        /** @param two - set to true if two primaries were seen; this can happen transiently,
+            especially since our polling is only occasional. in that case null is returned,
+            but the caller should not assume it is primary itself in that situation.
+ */
+ const Member* findOtherPrimary(bool& two);
+
+ void noteARemoteIsPrimary(const Member *);
+ void checkElectableSet();
+ void checkAuth();
+ virtual void starting();
+ public:
+ Manager(ReplSetImpl *rs);
+ virtual ~Manager();
+ void msgReceivedNewConfig(BSONObj);
+ void msgCheckNewState();
+ };
+
+ class GhostSync : public task::Server {
+ struct GhostSlave : boost::noncopyable {
+ GhostSlave() : last(0), slave(0), init(false) { }
+ OplogReader reader;
+ OpTime last;
+ Member* slave;
+ bool init;
+ };
+ /**
+ * This is a cache of ghost slaves
+ */
+ typedef map< mongo::OID,shared_ptr<GhostSlave> > MAP;
+ MAP _ghostCache;
+ RWLock _lock; // protects _ghostCache
+ ReplSetImpl *rs;
+ virtual void starting();
+ public:
+ GhostSync(ReplSetImpl *_rs) : task::Server("rsGhostSync"), _lock("GhostSync"), rs(_rs) {}
+ ~GhostSync() {
+ log() << "~GhostSync() called" << rsLog;
+ }
+
+ /**
+ * Replica sets can sync in a hierarchical fashion, which throws off w
+ * calculation on the master. percolate() faux-syncs from an upstream
+ * node so that the primary will know what the slaves are up to.
+ *
+ * We can't just directly sync to the primary because it could be
+ * unreachable, e.g., S1--->S2--->S3--->P. S2 should ghost sync from S3
+ * and S3 can ghost sync from the primary.
+ *
+ * Say we have an S1--->S2--->P situation and this node is S2. rid
+ * would refer to S1. S2 would create a ghost slave of S1 and connect
+ * it to P (_currentSyncTarget). Then it would use this connection to
+ * pretend to be S1, replicating off of P.
+ */
+ void percolate(const BSONObj& rid, const OpTime& last);
+ void associateSlave(const BSONObj& rid, const int memberId);
+ void updateSlave(const mongo::OID& id, const OpTime& last);
+ };
+
+ struct Target;
+
+ class Consensus {
+ ReplSetImpl &rs;
+ struct LastYea {
+ LastYea() : when(0), who(0xffffffff) { }
+ time_t when;
+ unsigned who;
+ };
+ static SimpleMutex lyMutex;
+ Guarded<LastYea,lyMutex> ly;
+ unsigned yea(unsigned memberId); // throws VoteException
+ void electionFailed(unsigned meid);
+ void _electSelf();
+ bool weAreFreshest(bool& allUp, int& nTies);
+ bool sleptLast; // slept last elect() pass
+ public:
+ Consensus(ReplSetImpl *t) : rs(*t) {
+ sleptLast = false;
+ steppedDown = 0;
+ }
+
+        /* if we've stepped down, this is when we are allowed to try to elect ourselves again.
+           todo: handle possible weirdness with clock skew etc.
+ */
+ time_t steppedDown;
+
+ int totalVotes() const;
+ bool aMajoritySeemsToBeUp() const;
+ bool shouldRelinquish() const;
+ void electSelf();
+ void electCmdReceived(BSONObj, BSONObjBuilder*);
+ void multiCommand(BSONObj cmd, list<Target>& L);
+ };
+
+ /**
+ * most operations on a ReplSet object should be done while locked. that
+     * logic is implemented here.
+ *
+ * Order of locking: lock the replica set, then take a rwlock.
+ */
+ class RSBase : boost::noncopyable {
+ public:
+ const unsigned magic;
+ void assertValid() { assert( magic == 0x12345677 ); }
+ private:
+ mongo::mutex m;
+ int _locked;
+ ThreadLocalValue<bool> _lockedByMe;
+ protected:
+ RSBase() : magic(0x12345677), m("RSBase"), _locked(0) { }
+ ~RSBase() {
+ /* this can happen if we throw in the constructor; otherwise never happens. thus we log it as it is quite unusual. */
+ log() << "replSet ~RSBase called" << rsLog;
+ }
+
+ public:
+ class lock {
+ RSBase& rsbase;
+ auto_ptr<scoped_lock> sl;
+ public:
+ lock(RSBase* b) : rsbase(*b) {
+ if( rsbase._lockedByMe.get() )
+ return; // recursive is ok...
+
+ sl.reset( new scoped_lock(rsbase.m) );
+ DEV assert(rsbase._locked == 0);
+ rsbase._locked++;
+ rsbase._lockedByMe.set(true);
+ }
+ ~lock() {
+ if( sl.get() ) {
+ assert( rsbase._lockedByMe.get() );
+ DEV assert(rsbase._locked == 1);
+ rsbase._lockedByMe.set(false);
+ rsbase._locked--;
+ }
+ }
+ };
+
+ /* for asserts */
+ bool locked() const { return _locked != 0; }
+
+        /* if true, is locked, and was locked by this thread. note if false, it could still be locked by another thread.
+           this is just for asserts & such so we can make the contracts clear on who locks what when.
+ we don't use these locks that frequently, so the little bit of overhead is fine.
+ */
+ bool lockedByMe() { return _lockedByMe.get(); }
+ };
+
+ class ReplSetHealthPollTask;
+
+    /* safe container for our state that keeps the member pointer and state variables consistent with each other */
+ class StateBox : boost::noncopyable {
+ public:
+ struct SP { // SP is like pair<MemberState,const Member *> but nicer
+ SP() : state(MemberState::RS_STARTUP), primary(0) { }
+ MemberState state;
+ const Member *primary;
+ };
+ const SP get() {
+ rwlock lk(m, false);
+ return sp;
+ }
+ MemberState getState() const {
+ rwlock lk(m, false);
+ return sp.state;
+ }
+ const Member* getPrimary() const {
+ rwlock lk(m, false);
+ return sp.primary;
+ }
+ void change(MemberState s, const Member *self) {
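+            // keep the cached primary pointer consistent with the new state:
+            // becoming primary points it at ourself; leaving the primary state clears it if it pointed at us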
+ rwlock lk(m, true);
+ if( sp.state != s ) {
+ log() << "replSet " << s.toString() << rsLog;
+ }
+ sp.state = s;
+ if( s.primary() ) {
+ sp.primary = self;
+ }
+ else {
+ if( self == sp.primary )
+ sp.primary = 0;
+ }
+ }
+ void set(MemberState s, const Member *p) {
+ rwlock lk(m, true);
+ sp.state = s;
+ sp.primary = p;
+ }
+ void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); }
+ void setOtherPrimary(const Member *mem) {
+ rwlock lk(m, true);
+ assert( !sp.state.primary() );
+ sp.primary = mem;
+ }
+ void noteRemoteIsPrimary(const Member *remote) {
+ rwlock lk(m, true);
+ if( !sp.state.secondary() && !sp.state.fatal() )
+ sp.state = MemberState::RS_RECOVERING;
+ sp.primary = remote;
+ }
+ StateBox() : m("StateBox") { }
+ private:
+ RWLock m;
+ SP sp;
+ };
+
+ void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet );
+
+ /** Parameter given to the --replSet command line option (parsed).
+ Syntax is "<setname>/<seedhost1>,<seedhost2>"
+ where setname is a name and seedhost is "<host>[:<port>]" */
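+    /* example (hypothetical hosts): --replSet rs0/alice.example.com:27017,bob.example.com:27017 */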
+ class ReplSetCmdline {
+ public:
+ ReplSetCmdline(string cfgString) { parseReplsetCmdLine(cfgString, setname, seeds, seedSet); }
+ string setname;
+ vector<HostAndPort> seeds;
+ set<HostAndPort> seedSet;
+ };
+
+ /* information about the entire repl set, such as the various servers in the set, and their state */
+ /* note: We currently do not free mem when the set goes away - it is assumed the replset is a
+ singleton and long lived.
+ */
+ class ReplSetImpl : protected RSBase {
+ public:
+ /** info on our state if the replset isn't yet "up". for example, if we are pre-initiation. */
+ enum StartupStatus {
+ PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3,
+ EMPTYUNREACHABLE=4, STARTED=5, SOON=6
+ };
+ static StartupStatus startupStatus;
+ static DiagStr startupStatusMsg;
+ static string stateAsHtml(MemberState state);
+
+ /* todo thread */
+ void msgUpdateHBInfo(HeartbeatInfo);
+
+ StateBox box;
+
+ OpTime lastOpTimeWritten;
+ long long lastH; // hash we use to make sure we are reading the right flow of ops and aren't on an out-of-date "fork"
+ private:
+ set<ReplSetHealthPollTask*> healthTasks;
+ void endOldHealthTasks();
+ void startHealthTaskFor(Member *m);
+
+ Consensus elect;
+ void relinquish();
+ void forgetPrimary();
+ protected:
+ bool _stepDown(int secs);
+ bool _freeze(int secs);
+ private:
+ void assumePrimary();
+ void loadLastOpTimeWritten(bool quiet=false);
+ void changeState(MemberState s);
+
+ /**
+ * Find the closest member (using ping time) with a higher latest optime.
+ */
+ Member* getMemberToSyncTo();
+ void veto(const string& host, unsigned secs=10);
+ Member* _currentSyncTarget;
+
+ bool _blockSync;
+ void blockSync(bool block);
+
+ // set of electable members' _ids
+ set<unsigned> _electableSet;
+ protected:
+ // "heartbeat message"
+ // sent in requestHeartbeat respond in field "hbm"
+        char _hbmsg[256]; // we change this unlocked, thus not a std::string
+ time_t _hbmsgTime; // when it was logged
+ public:
+ void sethbmsg(string s, int logLevel = 0);
+
+ /**
+ * Election with Priorities
+ *
+ * Each node (n) keeps a set of nodes that could be elected primary.
+ * Each node in this set:
+ *
+ * 1. can connect to a majority of the set
+ * 2. has a priority greater than 0
+ * 3. has an optime within 10 seconds of the most up-to-date node
+ * that n can reach
+ *
+ * If a node fails to meet one or more of these criteria, it is removed
+ * from the list. This list is updated whenever the node receives a
+ * heartbeat.
+ *
+ * When a node sends an "am I freshest?" query, the node receiving the
+ * query checks their electable list to make sure that no one else is
+ * electable AND higher priority. If this check passes, the node will
+ * return an "ok" response, if not, it will veto.
+ *
+ * If a node is primary and there is another node with higher priority
+ * on the electable list (i.e., it must be synced to within 10 seconds
+ * of the current primary), the node (or nodes) with connections to both
+ * the primary and the secondary with higher priority will issue
+ * replSetStepDown requests to the primary to allow the higher-priority
+ * node to take over.
+ */
+ void addToElectable(const unsigned m) { lock lk(this); _electableSet.insert(m); }
+ void rmFromElectable(const unsigned m) { lock lk(this); _electableSet.erase(m); }
+ bool iAmElectable() { lock lk(this); return _electableSet.find(_self->id()) != _electableSet.end(); }
+ bool isElectable(const unsigned id) { lock lk(this); return _electableSet.find(id) != _electableSet.end(); }
+ Member* getMostElectable();
+ protected:
+ /**
+ * Load a new config as the replica set's main config.
+ *
+ * If there is a "simple" change (just adding a node), this shortcuts
+ * the config. Returns true if the config was changed. Returns false
+         * if the config doesn't include this node.  Throws an exception if
+ * something goes very wrong.
+ *
+ * Behavior to note:
+ * - locks this
+ * - intentionally leaks the old _cfg and any old _members (if the
+ * change isn't strictly additive)
+ */
+ bool initFromConfig(ReplSetConfig& c, bool reconf=false);
+ void _fillIsMaster(BSONObjBuilder&);
+ void _fillIsMasterHost(const Member*, vector<string>&, vector<string>&, vector<string>&);
+ const ReplSetConfig& config() { return *_cfg; }
+ string name() const { return _name; } /* @return replica set's logical name */
+ MemberState state() const { return box.getState(); }
+ void _fatal();
+ void _getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const;
+ void _summarizeAsHtml(stringstream&) const;
+ void _summarizeStatus(BSONObjBuilder&) const; // for replSetGetStatus command
+
+ /* throws exception if a problem initializing. */
+ ReplSetImpl(ReplSetCmdline&);
+
+        /* call after constructing to start - returns fairly quickly after launching its threads */
+ void _go();
+
+ private:
+ string _name;
+ const vector<HostAndPort> *_seeds;
+ ReplSetConfig *_cfg;
+
+ /**
+ * Finds the configuration with the highest version number and attempts
+         * to load it.
+ */
+ bool _loadConfigFinish(vector<ReplSetConfig>& v);
+ /**
+ * Gather all possible configs (from command line seeds, our own config
+ * doc, and any hosts listed therein) and try to initiate from the most
+ * recent config we find.
+ */
+ void loadConfig();
+
+ list<HostAndPort> memberHostnames() const;
+ const ReplSetConfig::MemberCfg& myConfig() const { return _config; }
+ bool iAmArbiterOnly() const { return myConfig().arbiterOnly; }
+ bool iAmPotentiallyHot() const {
+ return myConfig().potentiallyHot() && // not an arbiter
+ elect.steppedDown <= time(0) && // not stepped down/frozen
+ state() == MemberState::RS_SECONDARY; // not stale
+ }
+ protected:
+ Member *_self;
+ bool _buildIndexes; // = _self->config().buildIndexes
+ void setSelfTo(Member *); // use this as it sets buildIndexes var
+ private:
+ List1<Member> _members; // all members of the set EXCEPT _self.
+ ReplSetConfig::MemberCfg _config; // config of _self
+ unsigned _id; // _id of _self
+
+ int _maintenanceMode; // if we should stay in recovering state
+ public:
+ // this is called from within a writelock in logOpRS
+ unsigned selfId() const { return _id; }
+ Manager *mgr;
+ GhostSync *ghost;
+ /**
+ * This forces a secondary to go into recovering state and stay there
+ * until this is called again, passing in "false". Multiple threads can
+ * call this and it will leave maintenance mode once all of the callers
+ * have called it again, passing in false.
+ */
+ void setMaintenanceMode(const bool inc);
+ private:
+ Member* head() const { return _members.head(); }
+ public:
+ const Member* findById(unsigned id) const;
+ private:
+ void _getTargets(list<Target>&, int &configVersion);
+ void getTargets(list<Target>&, int &configVersion);
+ void startThreads();
+ friend class FeedbackThread;
+ friend class CmdReplSetElect;
+ friend class Member;
+ friend class Manager;
+ friend class GhostSync;
+ friend class Consensus;
+
+ private:
+ bool initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid);
+ void _syncDoInitialSync();
+ void syncDoInitialSync();
+ void _syncThread();
+ bool tryToGoLiveAsASecondary(OpTime&); // readlocks
+ void syncTail();
+ unsigned _syncRollback(OplogReader& r);
+ void syncRollback(OplogReader& r);
+ void syncFixUp(HowToFixUp& h, OplogReader& r);
+
+ // get an oplog reader for a server with an oplog entry timestamp greater
+ // than or equal to minTS, if set.
+ Member* _getOplogReader(OplogReader& r, const OpTime& minTS);
+
+ // check lastOpTimeWritten against the remote's earliest op, filling in
+ // remoteOldestOp.
+ bool _isStale(OplogReader& r, const OpTime& minTS, BSONObj& remoteOldestOp);
+
+ // keep a list of hosts that we've tried recently that didn't work
+ map<string,time_t> _veto;
+ public:
+ void syncThread();
+ const OpTime lastOtherOpTime() const;
+ };
+
+ class ReplSet : public ReplSetImpl {
+ public:
+ ReplSet(ReplSetCmdline& replSetCmdline) : ReplSetImpl(replSetCmdline) { }
+
+ // for the replSetStepDown command
+ bool stepDown(int secs) { return _stepDown(secs); }
+
+ // for the replSetFreeze command
+ bool freeze(int secs) { return _freeze(secs); }
+
+ string selfFullName() {
+ assert( _self );
+ return _self->fullName();
+ }
+
+ bool buildIndexes() const { return _buildIndexes; }
+
+        /* call after constructing to start - returns fairly quickly after launching its threads */
+ void go() { _go(); }
+
+ void fatal() { _fatal(); }
+ bool isPrimary() { return box.getState().primary(); }
+ bool isSecondary() { return box.getState().secondary(); }
+ MemberState state() const { return ReplSetImpl::state(); }
+ string name() const { return ReplSetImpl::name(); }
+ const ReplSetConfig& config() { return ReplSetImpl::config(); }
+ void getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { _getOplogDiagsAsHtml(server_id,ss); }
+ void summarizeAsHtml(stringstream& ss) const { _summarizeAsHtml(ss); }
+ void summarizeStatus(BSONObjBuilder& b) const { _summarizeStatus(b); }
+ void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); }
+
+ /**
+ * We have a new config (reconfig) - apply it.
+ * @param comment write a no-op comment to the oplog about it. only
+ * makes sense if one is primary and initiating the reconf.
+ *
+ * The slaves are updated when they get a heartbeat indicating the new
+ * config. The comment is a no-op.
+ */
+ void haveNewConfig(ReplSetConfig& c, bool comment);
+
+ /**
+         * Pointer assignment isn't necessarily atomic, so this needs to ensure
+ * locking, even though we don't delete old configs.
+ */
+ const ReplSetConfig& getConfig() { return config(); }
+
+ bool lockedByMe() { return RSBase::lockedByMe(); }
+
+ // heartbeat msg to send to others; descriptive diagnostic info
+ string hbmsg() const {
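+            // consider a heartbeat message stale after two minutes and report nothing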
+ if( time(0)-_hbmsgTime > 120 ) return "";
+ return _hbmsg;
+ }
+ };
+
+ /**
+ * Base class for repl set commands. Checks basic things such if we're in
+ * rs mode before the command does its real work.
+ */
+ class ReplSetCommand : public Command {
+ protected:
+ ReplSetCommand(const char * s, bool show=false) : Command(s, show) { }
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual bool logTheOp() { return false; }
+ virtual LockType locktype() const { return NONE; }
+ virtual void help( stringstream &help ) const { help << "internal"; }
+
+ /**
+ * Some replica set commands call this and then call check(). This is
+ * intentional, as they might do things before theReplSet is initialized
+ * that still need to be checked for auth.
+ */
+ bool checkAuth(string& errmsg, BSONObjBuilder& result) {
+ if( !noauth ) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ if (!ai->isAuthorizedForLock("admin", locktype())) {
+ errmsg = "replSet command unauthorized";
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool check(string& errmsg, BSONObjBuilder& result) {
+ if( !replSet ) {
+ errmsg = "not running with --replSet";
+ if( cmdLine.configsvr ) {
+ result.append("info", "configsvr"); // for shell prompt
+ }
+ return false;
+ }
+
+ if( theReplSet == 0 ) {
+ result.append("startupStatus", ReplSet::startupStatus);
+ string s;
+ errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg.get();
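+                // 3 == ReplSetImpl::EMPTYCONFIG, i.e. the set has not been initiated yet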
+ if( ReplSet::startupStatus == 3 )
+ result.append("info", "run rs.initiate(...) if not yet done for the set");
+ return false;
+ }
+
+ return checkAuth(errmsg, result);
+ }
+ };
+
+ /**
+ * does local authentication
+ * directly authorizes against AuthenticationInfo
+ */
+ void replLocalAuth();
+
+ /** inlines ----------------- */
+
+ inline Member::Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self) :
+ _config(*c), _h(h), _hbinfo(ord) {
+ assert(c);
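+        // a member always considers itself healthy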
+ if( self )
+ _hbinfo.health = 1.0;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_config.cpp b/src/mongo/db/repl/rs_config.cpp
new file mode 100644
index 00000000000..22137773aec
--- /dev/null
+++ b/src/mongo/db/repl/rs_config.cpp
@@ -0,0 +1,662 @@
+// rs_config.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "../../client/dbclient.h"
+#include "../../client/syncclusterconnection.h"
+#include "../../util/net/hostandport.h"
+#include "../dbhelpers.h"
+#include "connections.h"
+#include "../oplog.h"
+#include "../instance.h"
+#include "../../util/text.h"
+#include <boost/algorithm/string.hpp>
+
+using namespace bson;
+
+namespace mongo {
+
+ void logOpInitiate(const bo&);
+
+ void assertOnlyHas(BSONObj o, const set<string>& fields) {
+ BSONObj::iterator i(o);
+ while( i.more() ) {
+ BSONElement e = i.next();
+ if( !fields.count( e.fieldName() ) ) {
+ uasserted(13434, str::stream() << "unexpected field '" << e.fieldName() << "' in object");
+ }
+ }
+ }
+
+ list<HostAndPort> ReplSetConfig::otherMemberHostnames() const {
+ list<HostAndPort> L;
+ for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); i++ ) {
+ if( !i->h.isSelf() )
+ L.push_back(i->h);
+ }
+ return L;
+ }
+
+ /* comment MUST only be set when initiating the set by the initiator */
+ void ReplSetConfig::saveConfigLocally(bo comment) {
+ checkRsConfig();
+ log() << "replSet info saving a newer config version to local.system.replset" << rsLog;
+ {
+ writelock lk("");
+ Client::Context cx( rsConfigNs );
+ cx.db()->flushFiles(true);
+
+ //theReplSet->lastOpTimeWritten = ??;
+ //rather than above, do a logOp()? probably
+ BSONObj o = asBson();
+ Helpers::putSingletonGod(rsConfigNs.c_str(), o, false/*logOp=false; local db so would work regardless...*/);
+ if( !comment.isEmpty() && (!theReplSet || theReplSet->isPrimary()) )
+ logOpInitiate(comment);
+
+ cx.db()->flushFiles(true);
+ }
+ log() << "replSet saveConfigLocally done" << rsLog;
+ }
+
+ bo ReplSetConfig::MemberCfg::asBson() const {
+ bob b;
+ b << "_id" << _id;
+ b.append("host", h.dynString());
+ if( votes != 1 ) b << "votes" << votes;
+ if( priority != 1.0 ) b << "priority" << priority;
+ if( arbiterOnly ) b << "arbiterOnly" << true;
+ if( slaveDelay ) b << "slaveDelay" << slaveDelay;
+ if( hidden ) b << "hidden" << hidden;
+ if( !buildIndexes ) b << "buildIndexes" << buildIndexes;
+ if( !tags.empty() ) {
+ BSONObjBuilder a;
+ for( map<string,string>::const_iterator i = tags.begin(); i != tags.end(); i++ )
+ a.append((*i).first, (*i).second);
+ b.append("tags", a.done());
+ }
+ return b.obj();
+ }
+
+ void ReplSetConfig::updateMembers(List1<Member> &dest) {
+ for (vector<MemberCfg>::iterator source = members.begin(); source < members.end(); source++) {
+ for( Member *d = dest.head(); d; d = d->next() ) {
+ if (d->fullName() == (*source).h.toString()) {
+ d->configw().groupsw() = (*source).groups();
+ }
+ }
+ }
+ }
+
+ bo ReplSetConfig::asBson() const {
+ bob b;
+ b.append("_id", _id).append("version", version);
+
+ BSONArrayBuilder a;
+ for( unsigned i = 0; i < members.size(); i++ )
+ a.append( members[i].asBson() );
+ b.append("members", a.arr());
+
+ if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() || !rules.empty()) {
+ bob settings;
+ if( !rules.empty() ) {
+ bob modes;
+ for (map<string,TagRule*>::const_iterator it = rules.begin(); it != rules.end(); it++) {
+ bob clauses;
+ vector<TagClause*> r = (*it).second->clauses;
+ for (vector<TagClause*>::iterator it2 = r.begin(); it2 < r.end(); it2++) {
+ clauses << (*it2)->name << (*it2)->target;
+ }
+ modes << (*it).first << clauses.obj();
+ }
+ settings << "getLastErrorModes" << modes.obj();
+ }
+ if( !getLastErrorDefaults.isEmpty() )
+ settings << "getLastErrorDefaults" << getLastErrorDefaults;
+ b << "settings" << settings.obj();
+ }
+
+ return b.obj();
+ }
+
+ static inline void mchk(bool expr) {
+ uassert(13126, "bad Member config", expr);
+ }
+
+ void ReplSetConfig::MemberCfg::check() const {
+ mchk(_id >= 0 && _id <= 255);
+ mchk(priority >= 0 && priority <= 1000);
+ mchk(votes <= 100); // votes >= 0 because it is unsigned
+ uassert(13419, "priorities must be between 0.0 and 100.0", priority >= 0.0 && priority <= 100.0);
+ uassert(13437, "slaveDelay requires priority be zero", slaveDelay == 0 || priority == 0);
+ uassert(13438, "bad slaveDelay value", slaveDelay >= 0 && slaveDelay <= 3600 * 24 * 366);
+ uassert(13439, "priority must be 0 when hidden=true", priority == 0 || !hidden);
+ uassert(13477, "priority must be 0 when buildIndexes=false", buildIndexes || priority == 0);
+ }
+/*
+ string ReplSetConfig::TagSubgroup::toString() const {
+ bool first = true;
+ string result = "\""+name+"\": [";
+ for (set<const MemberCfg*>::const_iterator i = m.begin(); i != m.end(); i++) {
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ result += (*i)->h.toString();
+ }
+ return result+"]";
+ }
+ */
+ string ReplSetConfig::TagClause::toString() const {
+ string result = name+": {";
+ for (map<string,TagSubgroup*>::const_iterator i = subgroups.begin(); i != subgroups.end(); i++) {
+//TEMP? result += (*i).second->toString()+", ";
+ }
+ result += "TagClause toString TEMPORARILY DISABLED";
+ return result + "}";
+ }
+
+ string ReplSetConfig::TagRule::toString() const {
+ string result = "{";
+ for (vector<TagClause*>::const_iterator it = clauses.begin(); it < clauses.end(); it++) {
+ result += ((TagClause*)(*it))->toString()+",";
+ }
+ return result+"}";
+ }
+
+ void ReplSetConfig::TagSubgroup::updateLast(const OpTime& op) {
+ RACECHECK
+ if (last < op) {
+ last = op;
+
+ for (vector<TagClause*>::iterator it = clauses.begin(); it < clauses.end(); it++) {
+ (*it)->updateLast(op);
+ }
+ }
+ }
+
+ void ReplSetConfig::TagClause::updateLast(const OpTime& op) {
+ RACECHECK
+ if (last >= op) {
+ return;
+ }
+
+ // check at least n subgroups greater than clause.last
+ int count = 0;
+ map<string,TagSubgroup*>::iterator it;
+ for (it = subgroups.begin(); it != subgroups.end(); it++) {
+ if ((*it).second->last >= op) {
+ count++;
+ }
+ }
+
+ if (count >= actualTarget) {
+ last = op;
+ rule->updateLast(op);
+ }
+ }
+
+ void ReplSetConfig::TagRule::updateLast(const OpTime& op) {
+ OpTime *earliest = (OpTime*)&op;
+ vector<TagClause*>::iterator it;
+
+ for (it = clauses.begin(); it < clauses.end(); it++) {
+ if ((*it)->last < *earliest) {
+ earliest = &(*it)->last;
+ }
+ }
+
+ // rules are simply and-ed clauses, so whatever the most-behind
+ // clause is at is what the rule is at
+ last = *earliest;
+ }
+
+ /** @param o old config
+ @param n new config
+ */
+ /*static*/
+ bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) {
+ assert( theReplSet );
+
+ if( o._id != n._id ) {
+ errmsg = "set name may not change";
+ return false;
+ }
+ /* TODO : wonder if we need to allow o.version < n.version only, which is more lenient.
+                  if someone had some intermediate config this node doesn't have, that could be
+ necessary. but then how did we become primary? so perhaps we are fine as-is.
+ */
+ if( o.version >= n.version ) {
+ errmsg = str::stream() << "version number must increase, old: "
+ << o.version << " new: " << n.version;
+ return false;
+ }
+
+ map<HostAndPort,const ReplSetConfig::MemberCfg*> old;
+ bool isLocalHost = false;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) {
+ if (i->h.isLocalHost()) {
+ isLocalHost = true;
+ }
+ old[i->h] = &(*i);
+ }
+ int me = 0;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) {
+ const ReplSetConfig::MemberCfg& m = *i;
+ if ( (isLocalHost && !m.h.isLocalHost()) || (!isLocalHost && m.h.isLocalHost())) {
+ log() << "reconfig error, cannot switch between localhost and hostnames: "
+ << m.h.toString() << rsLog;
+ uasserted(13645, "hosts cannot switch between localhost and hostname");
+ }
+ if( old.count(m.h) ) {
+ const ReplSetConfig::MemberCfg& oldCfg = *old[m.h];
+ if( oldCfg._id != m._id ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
+ uasserted(13432, "_id may not change for members");
+ }
+ if( oldCfg.buildIndexes != m.buildIndexes ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << rsLog;
+ uasserted(13476, "buildIndexes may not change for members");
+ }
+ /* are transitions to and from arbiterOnly guaranteed safe? if not, we should disallow here.
+ there is a test at replsets/replsetarb3.js */
+ if( oldCfg.arbiterOnly != m.arbiterOnly ) {
+ log() << "replSet reconfig error with member: " << m.h.toString() << " arbiterOnly cannot change. remove and readd the member instead " << rsLog;
+ uasserted(13510, "arbiterOnly may not change for members");
+ }
+ }
+ if( m.h.isSelf() )
+ me++;
+ }
+
+ uassert(13433, "can't find self in new replset config", me == 1);
+
+ return true;
+ }
+
+ void ReplSetConfig::clear() {
+ version = -5;
+ _ok = false;
+ }
+
+ void ReplSetConfig::setMajority() {
+ int total = members.size();
+ int nonArbiters = total;
+ int strictMajority = total/2+1;
+
+ for (vector<MemberCfg>::iterator it = members.begin(); it < members.end(); it++) {
+ if ((*it).arbiterOnly) {
+ nonArbiters--;
+ }
+ }
+
+ // majority should be all "normal" members if we have something like 4
+ // arbiters & 3 normal members
+ _majority = (strictMajority > nonArbiters) ? nonArbiters : strictMajority;
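+        // e.g. 7 members of which 4 are arbiters: strictMajority=4 > nonArbiters=3, so _majority=3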
+ }
+
+ int ReplSetConfig::getMajority() const {
+ return _majority;
+ }
+
+ void ReplSetConfig::checkRsConfig() const {
+ uassert(13132,
+ str::stream() << "nonmatching repl set name in _id field: " << _id << " vs. " << cmdLine.ourSetName(),
+ _id == cmdLine.ourSetName());
+ uassert(13308, "replSet bad config version #", version > 0);
+ uassert(13133, "replSet bad config no members", members.size() >= 1);
+ uassert(13309, "replSet bad config maximum number of members is 12", members.size() <= 12);
+ {
+ unsigned voters = 0;
+ for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); ++i ) {
+ if( i->votes )
+ voters++;
+ }
+ uassert(13612, "replSet bad config maximum number of voting members is 7", voters <= 7);
+ uassert(13613, "replSet bad config no voting members", voters > 0);
+ }
+ }
+
+ void ReplSetConfig::_populateTagMap(map<string,TagClause> &tagMap) {
+ // create subgroups for each server corresponding to each of
+ // its tags. E.g.:
+ //
+ // A is tagged with {"server" : "A", "dc" : "ny"}
+ // B is tagged with {"server" : "B", "dc" : "ny"}
+ //
+ // At the end of this step, tagMap will contain:
+ //
+ // "server" => {"A" : [A], "B" : [B]}
+ // "dc" => {"ny" : [A,B]}
+
+ for (unsigned i=0; i<members.size(); i++) {
+ MemberCfg member = members[i];
+
+ for (map<string,string>::iterator tag = member.tags.begin(); tag != member.tags.end(); tag++) {
+ string label = (*tag).first;
+ string value = (*tag).second;
+
+ TagClause& clause = tagMap[label];
+ clause.name = label;
+
+ TagSubgroup* subgroup;
+ // search for "ny" in "dc"'s clause
+ if (clause.subgroups.find(value) == clause.subgroups.end()) {
+ clause.subgroups[value] = subgroup = new TagSubgroup(value);
+ }
+ else {
+ subgroup = clause.subgroups[value];
+ }
+
+ subgroup->m.insert(&members[i]);
+ }
+ }
+ }
+
+ void ReplSetConfig::parseRules(const BSONObj& modes) {
+ map<string,TagClause> tagMap;
+ _populateTagMap(tagMap);
+
+ for (BSONObj::iterator i = modes.begin(); i.more(); ) {
+ unsigned int primaryOnly = 0;
+
+ // ruleName : {dc : 2, m : 3}
+ BSONElement rule = i.next();
+ uassert(14046, "getLastErrorMode rules must be objects", rule.type() == mongo::Object);
+
+ TagRule* r = new TagRule();
+
+ BSONObj clauseObj = rule.Obj();
+ for (BSONObj::iterator c = clauseObj.begin(); c.more(); ) {
+ BSONElement clauseElem = c.next();
+ uassert(14829, "getLastErrorMode criteria must be numeric", clauseElem.isNumber());
+
+ // get the clause, e.g., "x.y" : 3
+ const char *criteria = clauseElem.fieldName();
+ int value = clauseElem.numberInt();
+ uassert(14828, str::stream() << "getLastErrorMode criteria must be greater than 0: " << clauseElem, value > 0);
+
+ TagClause* node = new TagClause(tagMap[criteria]);
+
+ int numGroups = node->subgroups.size();
+ uassert(14831, str::stream() << "mode " << clauseObj << " requires "
+ << value << " tagged with " << criteria << ", but only "
+ << numGroups << " with this tag were found", numGroups >= value);
+
+ node->name = criteria;
+ node->target = value;
+ // if any subgroups contain "me", we can decrease the target
+ node->actualTarget = node->target;
+
+ // then we want to add pointers between clause & subgroup
+ for (map<string,TagSubgroup*>::iterator sgs = node->subgroups.begin();
+ sgs != node->subgroups.end(); sgs++) {
+ bool foundMe = false;
+ (*sgs).second->clauses.push_back(node);
+
+ // if this subgroup contains the primary, it's automatically always up-to-date
+ for( set<MemberCfg*>::const_iterator cfg = (*sgs).second->m.begin();
+ cfg != (*sgs).second->m.end();
+ cfg++)
+ {
+ if ((*cfg)->h.isSelf()) {
+ node->actualTarget--;
+ foundMe = true;
+ }
+ }
+
+ for (set<MemberCfg *>::iterator cfg = (*sgs).second->m.begin();
+ !foundMe && cfg != (*sgs).second->m.end(); cfg++) {
+ (*cfg)->groupsw().insert((*sgs).second);
+ }
+ }
+
+ // if all of the members of this clause involve the primary, it's always up-to-date
+ if (node->actualTarget == 0) {
+ node->last = OpTime(INT_MAX, INT_MAX);
+ primaryOnly++;
+ }
+
+ // this is a valid clause, so we want to add it to its rule
+ node->rule = r;
+ r->clauses.push_back(node);
+ }
+
+ // if all of the clauses are satisfied by the primary, this rule is trivially true
+ if (primaryOnly == r->clauses.size()) {
+ r->last = OpTime(INT_MAX, INT_MAX);
+ }
+
+ // if we got here, this is a valid rule
+ LOG(1) << "replSet new rule " << rule.fieldName() << ": " << r->toString() << rsLog;
+ rules[rule.fieldName()] = r;
+ }
+ }
+
+ void ReplSetConfig::from(BSONObj o) {
+ static const string legal[] = {"_id","version", "members","settings"};
+ static const set<string> legals(legal, legal + 4);
+ assertOnlyHas(o, legals);
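+        // a minimal config doc looks like (hypothetical example):
+        //   { _id: "rs0", version: 1, members: [ { _id: 0, host: "alice.example.com:27017" } ] }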
+
+ md5 = o.md5();
+ _id = o["_id"].String();
+ if( o["version"].ok() ) {
+ version = o["version"].numberInt();
+ uassert(13115, "bad " + rsConfigNs + " config: version", version > 0);
+ }
+
+ set<string> hosts;
+ set<int> ords;
+ vector<BSONElement> members;
+ try {
+ members = o["members"].Array();
+ }
+ catch(...) {
+ uasserted(13131, "replSet error parsing (or missing) 'members' field in config object");
+ }
+
+ unsigned localhosts = 0;
+ for( unsigned i = 0; i < members.size(); i++ ) {
+ BSONObj mobj = members[i].Obj();
+ MemberCfg m;
+ try {
+ static const string legal[] = {
+ "_id","votes","priority","host", "hidden","slaveDelay",
+ "arbiterOnly","buildIndexes","tags","initialSync" // deprecated
+ };
+ static const set<string> legals(legal, legal + 10);
+ assertOnlyHas(mobj, legals);
+
+ try {
+ m._id = (int) mobj["_id"].Number();
+ }
+ catch(...) {
+ /* TODO: use of string exceptions may be problematic for reconfig case! */
+ throw "_id must be numeric";
+ }
+ try {
+ string s = mobj["host"].String();
+ boost::trim(s);
+ m.h = HostAndPort(s);
+ if ( !m.h.hasPort() ) {
+ // make port explicit even if default
+ m.h.setPort(m.h.port());
+ }
+ }
+ catch(...) {
+ throw string("bad or missing host field? ") + mobj.toString();
+ }
+ if( m.h.isLocalHost() )
+ localhosts++;
+ m.arbiterOnly = mobj["arbiterOnly"].trueValue();
+ m.slaveDelay = mobj["slaveDelay"].numberInt();
+ if( mobj.hasElement("hidden") )
+ m.hidden = mobj["hidden"].trueValue();
+ if( mobj.hasElement("buildIndexes") )
+ m.buildIndexes = mobj["buildIndexes"].trueValue();
+ if( mobj.hasElement("priority") )
+ m.priority = mobj["priority"].Number();
+ if( mobj.hasElement("votes") )
+ m.votes = (unsigned) mobj["votes"].Number();
+ if( mobj.hasElement("tags") ) {
+ const BSONObj &t = mobj["tags"].Obj();
+ for (BSONObj::iterator c = t.begin(); c.more(); c.next()) {
+ m.tags[(*c).fieldName()] = (*c).String();
+ }
+ uassert(14827, "arbiters cannot have tags", !m.arbiterOnly || m.tags.empty() );
+ }
+ m.check();
+ }
+ catch( const char * p ) {
+ log() << "replSet cfg parsing exception for members[" << i << "] " << p << rsLog;
+ stringstream ss;
+ ss << "replSet members[" << i << "] " << p;
+ uassert(13107, ss.str(), false);
+ }
+ catch(DBException& e) {
+ log() << "replSet cfg parsing exception for members[" << i << "] " << e.what() << rsLog;
+ stringstream ss;
+ ss << "bad config for member[" << i << "] " << e.what();
+ uassert(13135, ss.str(), false);
+ }
+ if( !(ords.count(m._id) == 0 && hosts.count(m.h.toString()) == 0) ) {
+ log() << "replSet " << o.toString() << rsLog;
+ uassert(13108, "bad replset config -- duplicate hosts in the config object?", false);
+ }
+ hosts.insert(m.h.dynString());
+ ords.insert(m._id);
+ this->members.push_back(m);
+ }
+ uassert(13393, "can't use localhost in repl set member names except when using it for all members", localhosts == 0 || localhosts == members.size());
+ uassert(13117, "bad " + rsConfigNs + " config", !_id.empty());
+
+ if( o["settings"].ok() ) {
+ BSONObj settings = o["settings"].Obj();
+ if( settings["getLastErrorModes"].ok() ) {
+ parseRules(settings["getLastErrorModes"].Obj());
+ }
+ ho.check();
+ try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); }
+ catch(...) { }
+ }
+
+ // figure out the majority for this config
+ setMajority();
+ }
+
+ static inline void configAssert(bool expr) {
+ uassert(13122, "bad repl set config?", expr);
+ }
+
+ ReplSetConfig::ReplSetConfig(BSONObj cfg, bool force) {
+ _constructed = false;
+ clear();
+ from(cfg);
+ if( force ) {
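+            // bump the version by a large random amount (10000..109999),
+            // presumably so a forced config outranks any competing versions already in the set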
+ version += rand() % 100000 + 10000;
+ }
+ configAssert( version < 0 /*unspecified*/ || (version >= 1) );
+ if( version < 1 )
+ version = 1;
+ _ok = true;
+ _constructed = true;
+ }
+
+ ReplSetConfig::ReplSetConfig(const HostAndPort& h) {
+ LOG(2) << "ReplSetConfig load " << h.toStringLong() << rsLog;
+
+ _constructed = false;
+ clear();
+ int level = 2;
+ DEV level = 0;
+
+ BSONObj cfg;
+ int v = -5;
+ try {
+ if( h.isSelf() ) {
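+                // loading from ourself: skip the heartbeat sanity check below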
+ ;
+ }
+ else {
+ /* first, make sure other node is configured to be a replset. just to be safe. */
+ string setname = cmdLine.ourSetName();
+ BSONObj cmd = BSON( "replSetHeartbeat" << setname );
+ int theirVersion;
+ BSONObj info;
+ log() << "trying to contact " << h.toString() << rsLog;
+ bool ok = requestHeartbeat(setname, "", h.toString(), info, -2, theirVersion);
+ if( info["rs"].trueValue() ) {
+                    // yes, it is a replica set, although perhaps not yet initialized
+ }
+ else {
+ if( !ok ) {
+ log() << "replSet TEMP !ok heartbeating " << h.toString() << " on cfg load" << rsLog;
+ if( !info.isEmpty() )
+ log() << "replSet info " << h.toString() << " : " << info.toString() << rsLog;
+ return;
+ }
+ {
+ stringstream ss;
+ ss << "replSet error: member " << h.toString() << " is not in --replSet mode";
+                        msgassertedNoTrace(13260, ss.str().c_str()); // not a user exception, so it is not caught - we deliberately want it to propagate
+ //for python err# checker: uassert(13260, "", false);
+ }
+ }
+ }
+
+ v = -4;
+ unsigned long long count = 0;
+ try {
+ ScopedConn conn(h.toString());
+ v = -3;
+ cfg = conn.findOne(rsConfigNs, Query()).getOwned();
+ count = conn.count(rsConfigNs);
+ }
+ catch ( DBException& ) {
+ if ( !h.isSelf() ) {
+ throw;
+ }
+
+ // on startup, socket is not listening yet
+ DBDirectClient cli;
+ cfg = cli.findOne( rsConfigNs, Query() ).getOwned();
+ count = cli.count(rsConfigNs);
+ }
+
+ if( count > 1 )
+ uasserted(13109, str::stream() << "multiple rows in " << rsConfigNs << " not supported host: " << h.toString());
+
+ if( cfg.isEmpty() ) {
+ version = EMPTYCONFIG;
+ return;
+ }
+ version = -1;
+ }
+ catch( DBException& e) {
+ version = v;
+ log(level) << "replSet load config couldn't get from " << h.toString() << ' ' << e.what() << rsLog;
+ return;
+ }
+
+ from(cfg);
+ checkRsConfig();
+ _ok = true;
+ log(level) << "replSet load config ok from " << (h.isSelf() ? "self" : h.toString()) << rsLog;
+ _constructed = true;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_config.h b/src/mongo/db/repl/rs_config.h
new file mode 100644
index 00000000000..cfe2e86a568
--- /dev/null
+++ b/src/mongo/db/repl/rs_config.h
@@ -0,0 +1,251 @@
+// rs_config.h
+// repl set configuration
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/net/hostandport.h"
+#include "../../util/concurrency/race.h"
+#include "health.h"
+
+namespace mongo {
+ class Member;
+ const string rsConfigNs = "local.system.replset";
+
+ class ReplSetConfig {
+ enum { EMPTYCONFIG = -2 };
+ struct TagSubgroup;
+ public:
+ /**
+ * This contacts the given host and tries to get a config from them.
+ *
+ * This sends a test heartbeat to the host and, if all goes well and the
+ * host has a more recent config, fetches the config and loads it (see
+         * from()).
+ *
+ * If it's contacting itself, it skips the heartbeat (for obvious
+ * reasons.) If something is misconfigured, throws an exception. If the
+ * host couldn't be queried or is just blank, ok() will be false.
+ */
+ ReplSetConfig(const HostAndPort& h);
+
+ ReplSetConfig(BSONObj cfg, bool force=false);
+
+ bool ok() const { return _ok; }
+
+ struct TagRule;
+
+ struct MemberCfg {
+ MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false), buildIndexes(true) { }
+ int _id; /* ordinal */
+ unsigned votes; /* how many votes this node gets. default 1. */
+ HostAndPort h;
+ double priority; /* 0 means can never be primary */
+ bool arbiterOnly;
+            int slaveDelay; /* seconds.  int rather than unsigned for convenient to/from bson conversion. */
+            bool hidden; /* if set, don't advertise to drivers in isMaster. for non-primaries (priority 0) */
+ bool buildIndexes; /* if false, do not create any non-_id indexes */
+ map<string,string> tags; /* tagging for data center, rack, etc. */
+ private:
+ set<TagSubgroup*> _groups; // the subgroups this member belongs to
+ public:
+ const set<TagSubgroup*>& groups() const {
+ return _groups;
+ }
+ set<TagSubgroup*>& groupsw() {
+ return _groups;
+ }
+ void check() const; /* check validity, assert if not. */
+ BSONObj asBson() const;
+ bool potentiallyHot() const { return !arbiterOnly && priority > 0; }
+ void updateGroups(const OpTime& last) {
+ RACECHECK
+ for (set<TagSubgroup*>::const_iterator it = groups().begin(); it != groups().end(); it++) {
+ ((TagSubgroup*)(*it))->updateLast(last);
+ }
+ }
+ bool operator==(const MemberCfg& r) const {
+ if (!tags.empty() || !r.tags.empty()) {
+ if (tags.size() != r.tags.size()) {
+ return false;
+ }
+
+ // if they are the same size and not equal, at least one
+ // element in A must be different in B
+ for (map<string,string>::const_iterator lit = tags.begin(); lit != tags.end(); lit++) {
+ map<string,string>::const_iterator rit = r.tags.find((*lit).first);
+
+ if (rit == r.tags.end() || (*lit).second != (*rit).second) {
+ return false;
+ }
+ }
+ }
+
+ return _id==r._id && votes == r.votes && h == r.h && priority == r.priority &&
+ arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden &&
+                   buildIndexes == r.buildIndexes;
+ }
+ bool operator!=(const MemberCfg& r) const { return !(*this == r); }
+ };
+
+ vector<MemberCfg> members;
+ string _id;
+ int version;
+ HealthOptions ho;
+ string md5;
+ BSONObj getLastErrorDefaults;
+ map<string,TagRule*> rules;
+
+ list<HostAndPort> otherMemberHostnames() const; // except self
+
+        /** @return true if we could connect, and there is no cfg object there at all */
+ bool empty() const { return version == EMPTYCONFIG; }
+
+ string toString() const { return asBson().toString(); }
+
+ /** validate the settings. does not call check() on each member, you have to do that separately. */
+ void checkRsConfig() const;
+
+ /** check if modification makes sense */
+ static bool legalChange(const ReplSetConfig& old, const ReplSetConfig& n, string& errmsg);
+
+ //static void receivedNewConfig(BSONObj);
+ void saveConfigLocally(BSONObj comment); // to local db
+ string saveConfigEverywhere(); // returns textual info on what happened
+
+ /**
+ * Update members' groups when the config changes but members stay the same.
+ */
+ void updateMembers(List1<Member> &dest);
+
+ BSONObj asBson() const;
+
+ /**
+ * Getter and setter for _majority. This is almost always
+ * members.size()/2+1, but can be the number of non-arbiter members if
+ * there are more arbiters than non-arbiters (writing to 3 out of 7
+ * servers is safe if 4 of the servers are arbiters).
+ */
+ void setMajority();
+ int getMajority() const;
+
+ bool _constructed;
+ private:
+ bool _ok;
+ int _majority;
+
+ void from(BSONObj);
+ void clear();
+
+ struct TagClause;
+
+ /**
+ * This is a logical grouping of servers. It is pointed to by a set of
+ * servers with a certain tag.
+ *
+ * For example, suppose servers A, B, and C have the tag "dc" : "nyc". If we
+ * have a rule {"dc" : 2}, then we want A _or_ B _or_ C to have the
+ * write for one of the "dc" critiria to be fulfilled, so all three will
+ * point to this subgroup. When one of their oplog-tailing cursors is
+ * updated, this subgroup is updated.
+ */
+ struct TagSubgroup : boost::noncopyable {
+ ~TagSubgroup(); // never called; not defined
+ TagSubgroup(string nm) : name(nm) { }
+ const string name;
+ OpTime last;
+ vector<TagClause*> clauses;
+
+ // this probably won't actually point to valid members after the
+ // subgroup is created, as initFromConfig() makes a copy of the
+ // config
+ set<MemberCfg*> m;
+
+ void updateLast(const OpTime& op);
+
+ //string toString() const;
+
+ /**
+ * If two tags have the same name, they should compare as equal so
+ * that members don't have to update two identical groups on writes.
+ */
+ bool operator() (TagSubgroup& lhs, TagSubgroup& rhs) const {
+ return lhs.name < rhs.name;
+ }
+ };
+
+ /**
+ * An argument in a rule. For example, if we had the rule {dc : 2,
+ * machines : 3}, "dc" : 2 and "machines" : 3 would be two TagClauses.
+ *
+ * Each tag clause has a set of associated subgroups. For example, if
+ * we had "dc" : 2, our subgroups might be "nyc", "sf", and "hk".
+ */
+ struct TagClause {
+ OpTime last;
+ map<string,TagSubgroup*> subgroups;
+ TagRule *rule;
+ string name;
+ /**
+             * If we get a clause like {machines : 3} and this server is
+ * tagged with "machines", then it's really {machines : 2}, as we
+ * will always be up-to-date. So, target would be 3 and
+ * actualTarget would be 2, in that example.
+ */
+ int target;
+ int actualTarget;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
+
+ /**
+ * Parses getLastErrorModes.
+ */
+ void parseRules(const BSONObj& modes);
+
+ /**
+ * Create a hash containing every possible clause that could be used in a
+ * rule and the servers related to that clause.
+ *
+ * For example, suppose we have the following servers:
+ * A {"dc" : "ny", "ny" : "rk1"}
+ * B {"dc" : "ny", "ny" : "rk1"}
+ * C {"dc" : "ny", "ny" : "rk2"}
+ * D {"dc" : "sf", "sf" : "rk1"}
+ * E {"dc" : "sf", "sf" : "rk2"}
+ *
+ * This would give us the possible criteria:
+ * "dc" -> {A, B, C},{D, E}
+ * "ny" -> {A, B},{C}
+ * "sf" -> {D},{E}
+ */
+ void _populateTagMap(map<string,TagClause> &tagMap);
+
+ public:
+ struct TagRule {
+ vector<TagClause*> clauses;
+ OpTime last;
+
+ void updateLast(const OpTime& op);
+ string toString() const;
+ };
+ };
+
+}
diff --git a/src/mongo/db/repl/rs_exception.h b/src/mongo/db/repl/rs_exception.h
new file mode 100644
index 00000000000..fc372fc241c
--- /dev/null
+++ b/src/mongo/db/repl/rs_exception.h
@@ -0,0 +1,17 @@
+// @file rs_exception.h
+
+#pragma once
+
+namespace mongo {
+
+ class VoteException : public std::exception {
+ public:
+ const char * what() const throw () { return "VoteException"; }
+ };
+
+ class RetryAfterSleepException : public std::exception {
+ public:
+ const char * what() const throw () { return "RetryAfterSleepException"; }
+ };
+
+}
diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp
new file mode 100644
index 00000000000..b67c0d71b83
--- /dev/null
+++ b/src/mongo/db/repl/rs_initialsync.cpp
@@ -0,0 +1,271 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../repl.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../oplogreader.h"
+#include "../../util/mongoutils/str.h"
+#include "../dbhelpers.h"
+#include "rs_optime.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+ using namespace mongoutils;
+ using namespace bson;
+
+ void dropAllDatabasesExceptLocal();
+
+ // add try/catch with sleep
+
+ void isyncassert(const string& msg, bool expr) {
+ if( !expr ) {
+ string m = str::stream() << "initial sync " << msg;
+ theReplSet->sethbmsg(m, 0);
+ uasserted(13404, m);
+ }
+ }
+
+ void ReplSetImpl::syncDoInitialSync() {
+ createOplog();
+
+ while( 1 ) {
+ try {
+ _syncDoInitialSync();
+ break;
+ }
+ catch(DBException& e) {
+ sethbmsg("initial sync exception " + e.toString(), 0);
+ sleepsecs(30);
+ }
+ }
+ }
+
+ /* todo : progress metering to sethbmsg. */
+ static bool clone(const char *master, string db) {
+ string err;
+ return cloneFrom(master, err, db, false,
+ /* slave_ok */ true, true, false, /*mayYield*/true, /*mayBeInterrupted*/false);
+ }
+
+ void _logOpObjRS(const BSONObj& op);
+
+ static void emptyOplog() {
+ writelock lk(rsoplog);
+ Client::Context ctx(rsoplog);
+ NamespaceDetails *d = nsdetails(rsoplog);
+
+ // temp
+        if( d == 0 || d->stats.nrecords == 0 )
+            return; // nonexistent or already empty, ok.
+
+ LOG(1) << "replSet empty oplog" << rsLog;
+ d->emptyCappedCollection(rsoplog);
+ }
+
+ Member* ReplSetImpl::getMemberToSyncTo() {
+ Member *closest = 0;
+ time_t now = 0;
+ bool buildIndexes = true;
+
+ // wait for 2N pings before choosing a sync target
+ if (_cfg) {
+ int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings;
+
+ if (needMorePings > 0) {
+ OCCASIONALLY log() << "waiting for " << needMorePings << " pings from other members before syncing" << endl;
+ return NULL;
+ }
+
+ buildIndexes = myConfig().buildIndexes;
+ }
+
+ // find the member with the lowest ping time that has more data than me
+ for (Member *m = _members.head(); m; m = m->next()) {
+ if (m->hbinfo().up() &&
+ // make sure members with buildIndexes sync from other members w/indexes
+                (!buildIndexes || m->config().buildIndexes) &&
+ (m->state() == MemberState::RS_PRIMARY ||
+ (m->state() == MemberState::RS_SECONDARY && m->hbinfo().opTime > lastOpTimeWritten)) &&
+ (!closest || m->hbinfo().ping < closest->hbinfo().ping)) {
+
+ map<string,time_t>::iterator vetoed = _veto.find(m->fullName());
+ if (vetoed == _veto.end()) {
+ closest = m;
+ break;
+ }
+
+ if (now == 0) {
+ now = time(0);
+ }
+
+                // if this member was on the veto list, check whether its veto has already expired
+ if ((*vetoed).second < now) {
+ _veto.erase(vetoed);
+ closest = m;
+ break;
+ }
+
+ // if it was recently vetoed, skip
+ log() << "replSet not trying to sync from " << (*vetoed).first
+ << ", it is vetoed for " << ((*vetoed).second - now) << " more seconds" << rsLog;
+ }
+ }
+
+ {
+ lock lk(this);
+
+ if (!closest) {
+ _currentSyncTarget = NULL;
+ return NULL;
+ }
+
+ _currentSyncTarget = closest;
+ }
+
+ sethbmsg( str::stream() << "syncing to: " << closest->fullName(), 0);
+
+ return closest;
+ }
+
+ void ReplSetImpl::veto(const string& host, const unsigned secs) {
+ _veto[host] = time(0)+secs;
+ }
+
+ /**
+ * Do the initial sync for this member.
+ */
+ void ReplSetImpl::_syncDoInitialSync() {
+ sethbmsg("initial sync pending",0);
+
+ // if this is the first node, it may have already become primary
+ if ( box.getState().primary() ) {
+ sethbmsg("I'm already primary, no need for initial sync",0);
+ return;
+ }
+
+ const Member *source = getMemberToSyncTo();
+ if (!source) {
+ sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
+ sleepsecs(15);
+ return;
+ }
+
+ string sourceHostname = source->h().toString();
+ OplogReader r;
+ if( !r.connect(sourceHostname) ) {
+ sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
+ sleepsecs(15);
+ return;
+ }
+
+ BSONObj lastOp = r.getLastOp(rsoplog);
+ if( lastOp.isEmpty() ) {
+ sethbmsg("initial sync couldn't read remote oplog", 0);
+ sleepsecs(15);
+ return;
+ }
+ OpTime startingTS = lastOp["ts"]._opTime();
+
+ if (replSettings.fastsync) {
+ log() << "fastsync: skipping database clone" << rsLog;
+ }
+ else {
+ sethbmsg("initial sync drop all databases", 0);
+ dropAllDatabasesExceptLocal();
+
+ sethbmsg("initial sync clone all databases", 0);
+
+ list<string> dbs = r.conn()->getDatabaseNames();
+ for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) {
+ string db = *i;
+ if( db != "local" ) {
+ sethbmsg( str::stream() << "initial sync cloning db: " << db , 0);
+ bool ok;
+ {
+ writelock lk(db);
+ Client::Context ctx(db);
+ ok = clone(sourceHostname.c_str(), db);
+ }
+ if( !ok ) {
+ sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0);
+ veto(source->fullName(), 600);
+ sleepsecs(300);
+ return;
+ }
+ }
+ }
+ }
+
+ sethbmsg("initial sync query minValid",0);
+
+        /* our cloned copy will be inconsistent until we apply the oplog events that occurred
+           during the cloning process. we note that time point here. */
+ BSONObj minValid = r.getLastOp(rsoplog);
+ isyncassert( "getLastOp is empty ", !minValid.isEmpty() );
+ OpTime mvoptime = minValid["ts"]._opTime();
+ assert( !mvoptime.isNull() );
+ assert( mvoptime >= startingTS );
+
+ // apply startingTS..mvoptime portion of the oplog
+ {
+ // note we assume here that this call does not throw
+ if( ! initialSyncOplogApplication(startingTS, mvoptime) ) {
+ log() << "replSet initial sync failed during oplog application phase" << rsLog;
+
+ emptyOplog(); // otherwise we'll be up!
+
+ lastOpTimeWritten = OpTime();
+ lastH = 0;
+
+ log() << "replSet cleaning up [1]" << rsLog;
+ {
+ writelock lk("local.");
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ }
+ log() << "replSet cleaning up [2]" << rsLog;
+
+ log() << "replSet initial sync failed will try again" << endl;
+
+ sleepsecs(5);
+ return;
+ }
+ }
+
+ sethbmsg("initial sync finishing up",0);
+
+ assert( !box.getState().primary() ); // wouldn't make sense if we were.
+
+ {
+ writelock lk("local.");
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ try {
+ log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
+ }
+ catch(...) { }
+ Helpers::putSingleton("local.replset.minvalid", minValid);
+ cx.db()->flushFiles(true);
+ }
+
+ sethbmsg("initial sync done",0);
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_initiate.cpp b/src/mongo/db/repl/rs_initiate.cpp
new file mode 100644
index 00000000000..77bc6c03938
--- /dev/null
+++ b/src/mongo/db/repl/rs_initiate.cpp
@@ -0,0 +1,269 @@
+/* @file rs_initiate.cpp
+ */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../commands.h"
+#include "../../util/mmap.h"
+#include "../../util/mongoutils/str.h"
+#include "health.h"
+#include "rs.h"
+#include "rs_config.h"
+#include "../dbhelpers.h"
+#include "../oplog.h"
+
+using namespace bson;
+using namespace mongoutils;
+
+namespace mongo {
+
+ /* called on a reconfig AND on initiate
+ throws
+ @param initial true when initiating
+ */
+ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial) {
+ int failures = 0, allVotes = 0, allowableFailures = 0;
+ int me = 0;
+ stringstream selfs;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+ if( i->h.isSelf() ) {
+ me++;
+ if( me > 1 )
+ selfs << ',';
+ selfs << i->h.toString();
+ if( !i->potentiallyHot() ) {
+ uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary");
+ }
+ }
+ allVotes += i->votes;
+ }
+ allowableFailures = allVotes - (allVotes/2 + 1);
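+        // e.g. allVotes = 5 -> a majority is 3, so allowableFailures = 2;
+        //      allVotes = 4 -> a majority is 3, so allowableFailures = 1.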
+
+ uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups?
+ if( me != 1 ) {
+ stringstream ss;
+ ss << "can't find self in the replset config";
+ if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port;
+ if( me != 0 ) ss << " found: " << me;
+ uasserted(13279, ss.str());
+ }
+
+ vector<string> down;
+ for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+ // we know we're up
+ if (i->h.isSelf()) {
+ continue;
+ }
+
+ BSONObj res;
+ {
+ bool ok = false;
+ try {
+ int theirVersion = -1000;
+ ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/);
+ if( theirVersion >= cfg.version ) {
+ stringstream ss;
+ ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure";
+ uasserted(13259, ss.str());
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog;
+ }
+ catch(...) {
+ log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog;
+ }
+ if( res.getBoolField("mismatch") )
+ uasserted(13145, "set name does not match the set name host " + i->h.toString() + " expects");
+ if( *res.getStringField("set") ) {
+ if( cfg.version <= 1 ) {
+                    // this was to be an initiation; no one should be initiated already.
+ uasserted(13256, "member " + i->h.toString() + " is already initiated");
+ }
+ else {
+ // Assure no one has a newer config.
+ if( res["v"].Int() >= cfg.version ) {
+                        uasserted(13341, "member " + i->h.toString() + " has a config version >= the new cfg version; cannot change config");
+ }
+ }
+ }
+ if( !ok && !res["rs"].trueValue() ) {
+ down.push_back(i->h.toString());
+
+ if( !res.isEmpty() ) {
+ /* strange. got a response, but not "ok". log it. */
+ log() << "replSet warning " << i->h.toString() << " replied: " << res.toString() << rsLog;
+ }
+
+ bool allowFailure = false;
+ failures += i->votes;
+ if( !initial && failures <= allowableFailures ) {
+ const Member* m = theReplSet->findById( i->_id );
+ if( m ) {
+ assert( m->h().toString() == i->h.toString() );
+ }
+ // it's okay if the down member isn't part of the config,
+ // we might be adding a new member that isn't up yet
+ allowFailure = true;
+ }
+
+ if( !allowFailure ) {
+ string msg = string("need all members up to initiate, not ok : ") + i->h.toStringLong();
+ if( !initial )
+ msg = string("need most members up to reconfigure, not ok : ") + i->h.toString();
+ uasserted(13144, msg);
+ }
+ }
+ }
+ if( initial ) {
+ bool hasData = res["hasData"].Bool();
+ uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.",
+ !hasData || i->h.isSelf());
+ }
+ }
+ if (down.size() > 0) {
+ result.append("down", down);
+ }
+ }
+
+ class CmdReplSetInitiate : public ReplSetCommand {
+ public:
+ virtual LockType locktype() const { return NONE; }
+ CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") { }
+ virtual void help(stringstream& h) const {
+ h << "Initiate/christen a replica set.";
+ h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+ }
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << "replSet replSetInitiate admin command received from client" << rsLog;
+
+ if( !replSet ) {
+ errmsg = "server is not running with --replSet";
+ return false;
+ }
+ if( theReplSet ) {
+ errmsg = "already initialized";
+ result.append("info", "try querying " + rsConfigNs + " to see current configuration");
+ return false;
+ }
+
+ {
+ // just make sure we can get a write lock before doing anything else. we'll reacquire one
+ // later. of course it could be stuck then, but this check lowers the risk if weird things
+ // are up.
+ time_t t = time(0);
+ writelock lk("");
+ if( time(0)-t > 10 ) {
+                    errmsg = "took a long time to get write lock, so not initiating. Initiate when the server is less busy?";
+ return false;
+ }
+
+ /* check that we don't already have an oplog. that could cause issues.
+ it is ok if the initiating member has *other* data than that.
+ */
+ BSONObj o;
+ if( Helpers::getFirst(rsoplog, o) ) {
+ errmsg = rsoplog + string(" is not empty on the initiating member. cannot initiate.");
+ return false;
+ }
+ }
+
+ if( ReplSet::startupStatus == ReplSet::BADCONFIG ) {
+ errmsg = "server already in BADCONFIG state (check logs); not initiating";
+ result.append("info", ReplSet::startupStatusMsg.get());
+ return false;
+ }
+ if( ReplSet::startupStatus != ReplSet::EMPTYCONFIG ) {
+ result.append("startupStatus", ReplSet::startupStatus);
+ errmsg = "all members and seeds must be reachable to initiate set";
+ result.append("info", cmdLine._replSet);
+ return false;
+ }
+
+ BSONObj configObj;
+
+ if( cmdObj["replSetInitiate"].type() != Object ) {
+ result.append("info2", "no configuration explicitly specified -- making one");
+ log() << "replSet info initiate : no configuration specified. Using a default configuration for the set" << rsLog;
+
+ string name;
+ vector<HostAndPort> seeds;
+ set<HostAndPort> seedSet;
+ parseReplsetCmdLine(cmdLine._replSet, name, seeds, seedSet); // may throw...
+
+ bob b;
+ b.append("_id", name);
+ bob members;
+ members.append("0", BSON( "_id" << 0 << "host" << HostAndPort::Me().dynString() ));
+ result.append("me", HostAndPort::Me().toString());
+ for( unsigned i = 0; i < seeds.size(); i++ )
+ members.append(bob::numStr(i+1), BSON( "_id" << i+1 << "host" << seeds[i].toString()));
+ b.appendArray("members", members.obj());
+ configObj = b.obj();
+ log() << "replSet created this configuration for initiation : " << configObj.toString() << rsLog;
+ }
+ else {
+ configObj = cmdObj["replSetInitiate"].Obj();
+ }
+
+ bool parsed = false;
+ try {
+ ReplSetConfig newConfig(configObj);
+ parsed = true;
+
+ if( newConfig.version > 1 ) {
+ errmsg = "can't initiate with a version number greater than 1";
+ return false;
+ }
+
+ log() << "replSet replSetInitiate config object parses ok, " << newConfig.members.size() << " members specified" << rsLog;
+
+ checkMembersUpForConfigChange(newConfig, result, true);
+
+ log() << "replSet replSetInitiate all members seem up" << rsLog;
+
+ createOplog();
+
+ writelock lk("");
+ bo comment = BSON( "msg" << "initiating set");
+ newConfig.saveConfigLocally(comment);
+ log() << "replSet replSetInitiate config now saved locally. Should come online in about a minute." << rsLog;
+ result.append("info", "Config now saved locally. Should come online in about a minute.");
+ ReplSet::startupStatus = ReplSet::SOON;
+ ReplSet::startupStatusMsg.set("Received replSetInitiate - should come online shortly.");
+ }
+ catch( DBException& e ) {
+ log() << "replSet replSetInitiate exception: " << e.what() << rsLog;
+ if( !parsed )
+ errmsg = string("couldn't parse cfg object ") + e.what();
+ else
+ errmsg = string("couldn't initiate : ") + e.what();
+ return false;
+ }
+ catch( string& e2 ) {
+ log() << e2 << rsLog;
+ errmsg = e2;
+ return false;
+ }
+
+ return true;
+ }
+ } cmdReplSetInitiate;
+
+}
diff --git a/src/mongo/db/repl/rs_member.h b/src/mongo/db/repl/rs_member.h
new file mode 100644
index 00000000000..24e593392b6
--- /dev/null
+++ b/src/mongo/db/repl/rs_member.h
@@ -0,0 +1,131 @@
+// @file rs_member.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** replica set member */
+
+#pragma once
+
+#include "../../util/concurrency/value.h"
+
+namespace mongo {
+
+
+ /*
+       RS_STARTUP    server still starting up, or still trying to initiate the set
+ RS_PRIMARY this server thinks it is primary
+ RS_SECONDARY this server thinks it is a secondary (slave mode)
+ RS_RECOVERING recovering/resyncing; after recovery usually auto-transitions to secondary
+       RS_FATAL      something bad has occurred and the server is now effectively offline with regard to the replica set. fatal error.
+ RS_STARTUP2 loaded config, still determining who is primary
+ */
+ struct MemberState {
+ enum MS {
+ RS_STARTUP = 0,
+ RS_PRIMARY = 1,
+ RS_SECONDARY = 2,
+ RS_RECOVERING = 3,
+ RS_FATAL = 4,
+ RS_STARTUP2 = 5,
+ RS_UNKNOWN = 6, /* remote node not yet reached */
+ RS_ARBITER = 7,
+ RS_DOWN = 8, /* node not reachable for a report */
+ RS_ROLLBACK = 9
+ } s;
+
+ MemberState(MS ms = RS_UNKNOWN) : s(ms) { }
+ explicit MemberState(int ms) : s((MS) ms) { }
+
+ bool startup() const { return s == RS_STARTUP; }
+ bool primary() const { return s == RS_PRIMARY; }
+ bool secondary() const { return s == RS_SECONDARY; }
+ bool recovering() const { return s == RS_RECOVERING; }
+ bool startup2() const { return s == RS_STARTUP2; }
+ bool fatal() const { return s == RS_FATAL; }
+ bool rollback() const { return s == RS_ROLLBACK; }
+ bool readable() const { return s == RS_PRIMARY || s == RS_SECONDARY; }
+
+ string toString() const;
+
+ bool operator==(const MemberState& r) const { return s == r.s; }
+ bool operator!=(const MemberState& r) const { return s != r.s; }
+ };
+
+ /* this is supposed to be just basic information on a member,
+ and copy constructable. */
+ class HeartbeatInfo {
+ unsigned _id;
+ public:
+ HeartbeatInfo() : _id(0xffffffff), hbstate(MemberState::RS_UNKNOWN), health(-1.0),
+ downSince(0), skew(INT_MIN), authIssue(false), ping(0) { }
+ HeartbeatInfo(unsigned id);
+ unsigned id() const { return _id; }
+ MemberState hbstate;
+ double health;
+ time_t upSince;
+ long long downSince;
+ time_t lastHeartbeat;
+ DiagStr lastHeartbeatMsg;
+ OpTime opTime;
+ int skew;
+ bool authIssue;
+ unsigned int ping; // milliseconds
+ static unsigned int numPings;
+
+ bool up() const { return health > 0; }
+
+ /** health is set to -1 on startup. that means we haven't even checked yet. 0 means we checked and it failed. */
+ bool maybeUp() const { return health != 0; }
+
+ long long timeDown() const; // ms
+
+ /* true if changed in a way of interest to the repl set manager. */
+ bool changed(const HeartbeatInfo& old) const;
+ };
+
+ inline HeartbeatInfo::HeartbeatInfo(unsigned id) :
+ _id(id),
+ authIssue(false),
+ ping(0) {
+ hbstate = MemberState::RS_UNKNOWN;
+ health = -1.0;
+ downSince = 0;
+ lastHeartbeat = upSince = 0;
+ skew = INT_MIN;
+ }
+
+ inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const {
+ return health != old.health ||
+ hbstate != old.hbstate;
+ }
+
+ inline string MemberState::toString() const {
+ switch ( s ) {
+ case RS_STARTUP: return "STARTUP";
+ case RS_PRIMARY: return "PRIMARY";
+ case RS_SECONDARY: return "SECONDARY";
+ case RS_RECOVERING: return "RECOVERING";
+ case RS_FATAL: return "FATAL";
+ case RS_STARTUP2: return "STARTUP2";
+ case RS_ARBITER: return "ARBITER";
+ case RS_DOWN: return "DOWN";
+ case RS_ROLLBACK: return "ROLLBACK";
+ case RS_UNKNOWN: return "UNKNOWN";
+ }
+ return "";
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_optime.h b/src/mongo/db/repl/rs_optime.h
new file mode 100644
index 00000000000..f0ca56927ad
--- /dev/null
+++ b/src/mongo/db/repl/rs_optime.h
@@ -0,0 +1,58 @@
+// @file rs_optime.h
+
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../util/optime.h"
+
+namespace mongo {
+
+ const char rsoplog[] = "local.oplog.rs";
+
+ /*
+ class RSOpTime : public OpTime {
+ public:
+ bool initiated() const { return getSecs() != 0; }
+ };*/
+
+ /*struct RSOpTime {
+ unsigned long long ord;
+
+ RSOpTime() : ord(0) { }
+
+ bool initiated() const { return ord > 0; }
+
+ void initiate() {
+ assert( !initiated() );
+ ord = 1000000;
+ }
+
+ ReplTime inc() {
+ DEV assertInWriteLock();
+ return ++ord;
+ }
+
+ string toString() const { return str::stream() << ord; }
+
+ // query the oplog and set the highest value herein. acquires a db read lock. throws.
+ void load();
+ };
+
+ extern RSOpTime rsOpTime;*/
+
+}
diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp
new file mode 100644
index 00000000000..10727c59669
--- /dev/null
+++ b/src/mongo/db/repl/rs_rollback.cpp
@@ -0,0 +1,667 @@
+/* @file rs_rollback.cpp
+*
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../repl.h"
+#include "../ops/query.h"
+#include "../cloner.h"
+#include "../ops/update.h"
+#include "../ops/delete.h"
+
+/* Scenarios
+
+ We went offline with ops not replicated out.
+
+ F = node that failed and coming back.
+ P = node that took over, new primary
+
+ #1:
+ F : a b c d e f g
+ P : a b c d q
+
+ The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P
+ will have significantly more data. Also note that P may have a proper subset of F's stream if there were
+ no subsequent writes.
+
+ For now the model is simply : get F back in sync with P. If P was really behind or something, we should have
+ just chosen not to fail over anyway.
+
+ #2:
+ F : a b c d e f g -> a b c d
+ P : a b c d
+
+ #3:
+ F : a b c d e f g -> a b c d q r s t u v w x z
+ P : a b c d q r s t u v w x z
+
+ Steps
+ find an event in common. 'd'.
+ undo our events beyond that by:
+ (1) taking copy from other server of those objects
+ (2) do not consider the copy valid until we reach an optime after the point at which we fetched the new version of the object
+ -- i.e., reset minvalid.
+ (3) we could skip operations on objects that are previous in time to our capture of the object as an optimization.
+
+*/
+
+namespace mongo {
+
+ using namespace bson;
+
+ void incRBID();
+
+ class rsfatal : public std::exception {
+ public:
+ virtual const char* what() const throw() { return "replica set fatal exception"; }
+ };
+
+ struct DocID {
+ const char *ns;
+ be _id;
+ bool operator<(const DocID& d) const {
+ int c = strcmp(ns, d.ns);
+ if( c < 0 ) return true;
+ if( c > 0 ) return false;
+ return _id < d._id;
+ }
+ };
+
+ struct HowToFixUp {
+ /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only
+ need to refetch it once. */
+ set<DocID> toRefetch;
+
+ /* collections to drop */
+ set<string> toDrop;
+
+ set<string> collectionsToResync;
+
+ OpTime commonPoint;
+ DiskLoc commonPointOurDiskloc;
+
+ int rbid; // remote server's current rollback sequence #
+ };
+
+ static void refetch(HowToFixUp& h, const BSONObj& ourObj) {
+ const char *op = ourObj.getStringField("op");
+ if( *op == 'n' )
+ return;
+
+ unsigned long long totSize = 0;
+ totSize += ourObj.objsize();
+ if( totSize > 512 * 1024 * 1024 )
+ throw "rollback too large";
+
+ DocID d;
+ // NOTE The assigned ns value may become invalid if we yield.
+ d.ns = ourObj.getStringField("ns");
+ if( *d.ns == 0 ) {
+ log() << "replSet WARNING ignoring op on rollback no ns TODO : " << ourObj.toString() << rsLog;
+ return;
+ }
+
+ bo o = ourObj.getObjectField(*op=='u' ? "o2" : "o");
+ if( o.isEmpty() ) {
+ log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog;
+ return;
+ }
+
+ if( *op == 'c' ) {
+ be first = o.firstElement();
+ NamespaceString s(d.ns); // foo.$cmd
+ string cmdname = first.fieldName();
+ Command *cmd = Command::findCommand(cmdname.c_str());
+ if( cmd == 0 ) {
+                log() << "replSet warning rollback no such command " << first.fieldName() << " - different mongod versions perhaps?" << rsLog;
+ return;
+ }
+ else {
+                /* findandmodify - translated?
+ godinsert?,
+ renamecollection a->b. just resync a & b
+ */
+ if( cmdname == "create" ) {
+ /* Create collection operation
+ { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } }
+ */
+ string ns = s.db + '.' + o["create"].String(); // -> foo.abc
+ h.toDrop.insert(ns);
+ return;
+ }
+ else if( cmdname == "drop" ) {
+ string ns = s.db + '.' + first.valuestr();
+ h.collectionsToResync.insert(ns);
+ return;
+ }
+ else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) {
+                    /* TODO: this is bad. we simply do a full resync of the collection here, which could be very slow. */
+ log() << "replSet info rollback of dropIndexes is slow in this version of mongod" << rsLog;
+ string ns = s.db + '.' + first.valuestr();
+ h.collectionsToResync.insert(ns);
+ return;
+ }
+ else if( cmdname == "renameCollection" ) {
+ /* TODO: slow. */
+ log() << "replSet info rollback of renameCollection is slow in this version of mongod" << rsLog;
+ string from = first.valuestr();
+ string to = o["to"].String();
+ h.collectionsToResync.insert(from);
+ h.collectionsToResync.insert(to);
+ return;
+ }
+ else if( cmdname == "reIndex" ) {
+ return;
+ }
+ else if( cmdname == "dropDatabase" ) {
+                    log() << "replSet error rollback : can't rollback drop database; full resync will be required" << rsLog;
+ log() << "replSet " << o.toString() << rsLog;
+ throw rsfatal();
+ }
+ else {
+ log() << "replSet error can't rollback this command yet: " << o.toString() << rsLog;
+ log() << "replSet cmdname=" << cmdname << rsLog;
+ throw rsfatal();
+ }
+ }
+ }
+
+ d._id = o["_id"];
+ if( d._id.eoo() ) {
+ log() << "replSet WARNING ignoring op on rollback no _id TODO : " << d.ns << ' '<< ourObj.toString() << rsLog;
+ return;
+ }
+
+ h.toRefetch.insert(d);
+ }
+
+ int getRBID(DBClientConnection*);
+
+ static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
+ static time_t last;
+ if( time(0)-last < 60 ) {
+ throw "findcommonpoint waiting a while before trying again";
+ }
+ last = time(0);
+
+ assert( d.dbMutex.atLeastReadLocked() );
+ Client::Context c(rsoplog);
+ NamespaceDetails *nsd = nsdetails(rsoplog);
+ assert(nsd);
+ ReverseCappedCursor u(nsd);
+ if( !u.ok() )
+ throw "our oplog empty or unreadable";
+
+ const Query q = Query().sort(reverseNaturalObj);
+ const bo fields = BSON( "ts" << 1 << "h" << 1 );
+
+ //auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0);
+
+ h.rbid = getRBID(them);
+ auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0);
+
+ if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable";
+
+ BSONObj ourObj = u.current();
+ OpTime ourTime = ourObj["ts"]._opTime();
+ BSONObj theirObj = t->nextSafe();
+ OpTime theirTime = theirObj["ts"]._opTime();
+
+ {
+ long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs());
+ /* diff could be positive, negative, or zero */
+ log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog;
+ log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog;
+ log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog;
+ if( diff > 1800 ) {
+ log() << "replSet rollback too long a time period for a rollback." << rsLog;
+ throw "error not willing to roll back more than 30 minutes of data";
+ }
+ }
+
+ unsigned long long scanned = 0;
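+        // the loop below walks both oplogs newest-to-oldest like a merge:
+        // whichever side has the later ts advances (ours while queueing refetches);
+        // on a ts tie the hashes decide - a match is the common point, otherwise
+        // both sides advance. e.g. if ours ends ..d e f and theirs ends ..d q,
+        // we walk back past f and e (refetching their versions of the touched
+        // docs) and past q, until both cursors sit on d.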
+ while( 1 ) {
+ scanned++;
+ /* todo add code to assure no excessive scanning for too long */
+ if( ourTime == theirTime ) {
+ if( ourObj["h"].Long() == theirObj["h"].Long() ) {
+ // found the point back in time where we match.
+ // todo : check a few more just to be careful about hash collisions.
+ log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog;
+ log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog;
+ h.commonPoint = ourTime;
+ h.commonPointOurDiskloc = u.currLoc();
+ return;
+ }
+
+ refetch(h, ourObj);
+
+ if( !t->more() ) {
+ log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS100 reached beginning of remote oplog [2]";
+ }
+ theirObj = t->nextSafe();
+ theirTime = theirObj["ts"]._opTime();
+
+ u.advance();
+ if( !u.ok() ) {
+ log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS101 reached beginning of local oplog [1]";
+ }
+ ourObj = u.current();
+ ourTime = ourObj["ts"]._opTime();
+ }
+ else if( theirTime > ourTime ) {
+ if( !t->more() ) {
+ log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS100 reached beginning of remote oplog [1]";
+ }
+ theirObj = t->nextSafe();
+ theirTime = theirObj["ts"]._opTime();
+ }
+ else {
+ // theirTime < ourTime
+ refetch(h, ourObj);
+ u.advance();
+ if( !u.ok() ) {
+ log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
+ log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
+ log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
+ log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
+ throw "RS101 reached beginning of local oplog [2]";
+ }
+ ourObj = u.current();
+ ourTime = ourObj["ts"]._opTime();
+ }
+ }
+ }
+
+ struct X {
+ const bson::bo *op;
+ bson::bo goodVersionOfObject;
+ };
+
+ static void setMinValid(bo newMinValid) {
+ try {
+ log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog;
+ }
+ catch(...) { }
+ {
+ Helpers::putSingleton("local.replset.minvalid", newMinValid);
+ Client::Context cx( "local." );
+ cx.db()->flushFiles(true);
+ }
+ }
+
+ void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
+ DBClientConnection *them = r.conn();
+
+ // fetch all first so we needn't handle interruption in a fancy way
+
+ unsigned long long totSize = 0;
+
+ list< pair<DocID,bo> > goodVersions;
+
+ bo newMinValid;
+
+ /* fetch all the goodVersions of each document from current primary */
+ DocID d;
+ unsigned long long n = 0;
+ try {
+ for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
+ d = *i;
+
+ assert( !d._id.eoo() );
+
+ {
+ /* TODO : slow. lots of round trips. */
+ n++;
+                    bo good = them->findOne(d.ns, d._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
+ totSize += good.objsize();
+ uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );
+
+ // note good might be eoo, indicating we should delete it
+ goodVersions.push_back(pair<DocID,bo>(d,good));
+ }
+ }
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ sethbmsg("rollback error newMinValid empty?");
+ return;
+ }
+ }
+ catch(DBException& e) {
+ sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
+ log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
+ throw e;
+ }
+
+ MemoryMappedFile::flushAll(true);
+
+ sethbmsg("rollback 3.5");
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
+ return;
+ }
+
+ // update them
+ sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());
+
+ bool warn = false;
+
+ assert( !h.commonPointOurDiskloc.isNull() );
+
+ mongo::d.dbMutex.assertWriteLocked();
+
+ /* we have items we are writing that aren't from a point-in-time. thus best not to come online
+ until we get to that point in freshness. */
+ setMinValid(newMinValid);
+
+ /** any full collection resyncs required? */
+ if( !h.collectionsToResync.empty() ) {
+ for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
+ string ns = *i;
+ sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
+
+ Client::Context c(ns);
+ {
+ bob res;
+ string errmsg;
+ dropCollection(ns, errmsg, res);
+ {
+ dbtemprelease r;
+ bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, errmsg);
+ uassert(15909, str::stream() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg, ok);
+ }
+ }
+ }
+
+ /* we did more reading from primary, so check it again for a rollback (which would mess us up), and
+ make minValid newer.
+ */
+ sethbmsg("rollback 4.2");
+ {
+ string err;
+ try {
+ newMinValid = r.getLastOp(rsoplog);
+ if( newMinValid.isEmpty() ) {
+ err = "can't get minvalid from primary";
+ }
+ else {
+ setMinValid(newMinValid);
+ }
+ }
+ catch (DBException&) {
+ err = "can't get/set minvalid";
+ }
+ if( h.rbid != getRBID(r.conn()) ) {
+ // our source rolled back itself. so the data we received isn't necessarily consistent.
+ // however, we've now done writes. thus we have a problem.
+ err += "rbid at primary changed during resync/rollback";
+ }
+ if( !err.empty() ) {
+ log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
+ /* todo: reset minvalid so that we are permanently in fatal state */
+ /* todo: don't be fatal, but rather, get all the data first. */
+ sethbmsg("rollback error");
+ throw rsfatal();
+ }
+ }
+ sethbmsg("rollback 4.3");
+ }
+
+ sethbmsg("rollback 4.6");
+        /** drop collections to drop before doing individual fixups - doing the drops first can make the fixups below faster if there were subsequent inserts to roll back */
+ for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
+ Client::Context c(*i);
+ try {
+ bob res;
+ string errmsg;
+ log(1) << "replSet rollback drop: " << *i << rsLog;
+ dropCollection(*i, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset rollback error dropping collection " << *i << rsLog;
+ }
+ }
+
+ sethbmsg("rollback 4.7");
+ Client::Context c(rsoplog);
+ NamespaceDetails *oplogDetails = nsdetails(rsoplog);
+ uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);
+
+ map<string,shared_ptr<RemoveSaver> > removeSavers;
+
+ unsigned deletes = 0, updates = 0;
+ for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
+ const DocID& d = i->first;
+ bo pattern = d._id.wrap(); // { _id : ... }
+ try {
+ assert( d.ns && *d.ns );
+ if( h.collectionsToResync.count(d.ns) ) {
+ /* we just synced this entire collection */
+ continue;
+ }
+
+ getDur().commitIfNeeded();
+
+ /* keep an archive of items rolled back */
+ shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
+ if ( ! rs )
+ rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );
+
+ // todo: lots of overhead in context, this can be faster
+ Client::Context c(d.ns);
+ if( i->second.isEmpty() ) {
+ // wasn't on the primary; delete.
+ /* TODO1.6 : can't delete from a capped collection. need to handle that here. */
+ deletes++;
+
+ NamespaceDetails *nsd = nsdetails(d.ns);
+ if( nsd ) {
+ if( nsd->capped ) {
+ /* can't delete from a capped collection - so we truncate instead. if this item must go,
+ so must all successors!!! */
+ try {
+                                    /** todo: IIRC cappedTruncateAfter does not handle completely empty. todo. */
+                                    // this will be crazy slow if there is no _id index.
+ long long start = Listener::getElapsedTimeMillis();
+ DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
+ if( Listener::getElapsedTimeMillis() - start > 200 )
+ log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
+ //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
+ if( !loc.isNull() ) {
+ try {
+ nsd->cappedTruncateAfter(d.ns, loc, true);
+ }
+ catch(DBException& e) {
+ if( e.getCode() == 13415 ) {
+ // hack: need to just make cappedTruncate do this...
+ nsd->emptyCappedCollection(d.ns);
+ }
+ else {
+ throw;
+ }
+ }
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
+ }
+ }
+ else {
+ try {
+ deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
+ }
+ catch(...) {
+ log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
+ }
+ }
+ // did we just empty the collection? if so let's check if it even exists on the source.
+ if( nsd->stats.nrecords == 0 ) {
+ try {
+ string sys = cc().database()->name + ".system.namespaces";
+ bo o = them->findOne(sys, QUERY("name"<<d.ns));
+ if( o.isEmpty() ) {
+ // we should drop
+ try {
+ bob res;
+ string errmsg;
+ dropCollection(d.ns, errmsg, res);
+ }
+ catch(...) {
+ log() << "replset error rolling back collection " << d.ns << rsLog;
+ }
+ }
+ }
+ catch(DBException& ) {
+ /* this isn't *that* big a deal, but is bad. */
+ log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
+ }
+ }
+ }
+ }
+ else {
+ // todo faster...
+ OpDebug debug;
+ updates++;
+ _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
+ warn = true;
+ }
+ }
+
+ removeSavers.clear(); // this effectively closes all of them
+
+ sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
+ MemoryMappedFile::flushAll(true);
+ sethbmsg("rollback 6");
+
+ // clean up oplog
+ LOG(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
+ // todo: fatal error if this throws?
+ oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
+
+ /* reset cached lastoptimewritten and h value */
+ loadLastOpTimeWritten();
+
+ sethbmsg("rollback 7");
+ MemoryMappedFile::flushAll(true);
+
+ // done
+ if( warn )
+ sethbmsg("issues during syncRollback, see log");
+ else
+ sethbmsg("rollback done");
+ }
+
+ void ReplSetImpl::syncRollback(OplogReader&r) {
+ unsigned s = _syncRollback(r);
+ if( s )
+ sleepsecs(s);
+ }
+
+ unsigned ReplSetImpl::_syncRollback(OplogReader&r) {
+ assert( !lockedByMe() );
+ assert( !d.dbMutex.atLeastReadLocked() );
+
+ sethbmsg("rollback 0");
+
+ writelocktry lk(rsoplog, 20000);
+ if( !lk.got() ) {
+ sethbmsg("rollback couldn't get write lock in a reasonable time");
+ return 2;
+ }
+
+ if( state().secondary() ) {
+            /* by doing this, we will not service reads (we return an error as we aren't in secondary state).
+               that perhaps is moot because of the write lock above, but that write lock probably gets deferred
+ or removed or yielded later anyway.
+
+ also, this is better for status reporting - we know what is happening.
+ */
+ changeState(MemberState::RS_ROLLBACK);
+ }
+
+ HowToFixUp how;
+ sethbmsg("rollback 1");
+ {
+ r.resetCursor();
+
+ sethbmsg("rollback 2 FindCommonPoint");
+ try {
+ syncRollbackFindCommonPoint(r.conn(), how);
+ }
+ catch( const char *p ) {
+ sethbmsg(string("rollback 2 error ") + p);
+ return 10;
+ }
+ catch( rsfatal& ) {
+ _fatal();
+ return 2;
+ }
+ catch( DBException& e ) {
+ sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min");
+ dbtemprelease r;
+ sleepsecs(60);
+ throw;
+ }
+ }
+
+ sethbmsg("replSet rollback 3 fixup");
+
+ {
+ incRBID();
+ try {
+ syncFixUp(how, r);
+ }
+ catch( rsfatal& ) {
+ sethbmsg("rollback fixup error");
+ _fatal();
+ return 2;
+ }
+ catch(...) {
+ incRBID(); throw;
+ }
+ incRBID();
+
+ /* success - leave "ROLLBACK" state
+ can go to SECONDARY once minvalid is achieved
+ */
+ changeState(MemberState::RS_RECOVERING);
+ }
+
+ return 0;
+ }
+
+}
diff --git a/src/mongo/db/repl/rs_sync.cpp b/src/mongo/db/repl/rs_sync.cpp
new file mode 100644
index 00000000000..8bac981d951
--- /dev/null
+++ b/src/mongo/db/repl/rs_sync.cpp
@@ -0,0 +1,701 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "rs.h"
+#include "../repl.h"
+#include "connections.h"
+
+namespace mongo {
+
+ using namespace bson;
+ extern unsigned replSetForceInitialSyncFailure;
+
+ void NOINLINE_DECL blank(const BSONObj& o) {
+ if( *o.getStringField("op") != 'n' ) {
+ log() << "replSet skipping bad op in oplog: " << o.toString() << rsLog;
+ }
+ }
+
+ /* apply the log op that is in param o
+ @return bool success (true) or failure (false)
+ */
+ bool replset::SyncTail::syncApply(const BSONObj &o) {
+ const char *ns = o.getStringField("ns");
+ if ( *ns == '.' || *ns == 0 ) {
+ blank(o);
+ return true;
+ }
+
+ Client::Context ctx(ns);
+ ctx.getClient()->curop()->reset();
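+        // applyOperation_inlock returns true when it could not apply the op
+        // (e.g. an update whose target document is missing locally), hence the
+        // negation: syncApply reports success.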
+ return !applyOperation_inlock(o);
+ }
+
+ /* initial oplog application, during initial sync, after cloning.
+ @return false on failure.
+       this method returns an error and doesn't throw exceptions (I think).
+ */
+ bool ReplSetImpl::initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid) {
+ Member *source = 0;
+ OplogReader r;
+
+ // keep trying to initial sync from oplog until we run out of targets
+ while ((source = _getOplogReader(r, applyGTE)) != 0) {
+ replset::InitialSync init(source->fullName());
+ if (init.oplogApplication(r, source, applyGTE, minValid)) {
+ return true;
+ }
+
+ r.resetConnection();
+ veto(source->fullName(), 60);
+ log() << "replSet applying oplog from " << source->fullName() << " failed, trying again" << endl;
+ }
+
+ log() << "replSet initial sync error: couldn't find oplog to sync from" << rsLog;
+ return false;
+ }
+
+ bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source,
+ const OpTime& applyGTE, const OpTime& minValid) {
+
+ const string hn = source->fullName();
+ try {
+ r.tailingQueryGTE( rsoplog, applyGTE );
+ if ( !r.haveCursor() ) {
+ log() << "replSet initial sync oplog query error" << rsLog;
+ return false;
+ }
+
+ {
+ if( !r.more() ) {
+ sethbmsg("replSet initial sync error reading remote oplog");
+ log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
+ return false;
+ }
+ bo op = r.next();
+ OpTime t = op["ts"]._opTime();
+ r.putBack(op);
+
+ if( op.firstElementFieldName() == string("$err") ) {
+ log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
+ return false;
+ }
+
+ uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
+ if( t > applyGTE ) {
+ sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
+ log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
+ log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
+ return false;
+ }
+
+ sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
+ << t.toStringPretty() << " to " << minValid.toStringPretty());
+ }
+ }
+ catch(DBException& e) {
+ log() << "replSet initial sync failing: " << e.toString() << rsLog;
+ return false;
+ }
+
+ /* we lock outside the loop to avoid the overhead of locking on every operation. */
+ writelock lk("");
+
+ // todo : use exhaust
+ OpTime ts;
+ time_t start = time(0);
+ unsigned long long n = 0;
+ int fails = 0;
+ while( ts < minValid ) {
+ try {
+ // There are some special cases with initial sync (see the catch block), so we
+ // don't want to break out of this while until we've reached minvalid. Thus, we'll
+ // keep trying to requery.
+ if( !r.more() ) {
+ OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl;
+ sleepsecs(1);
+
+ r.resetCursor();
+ r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten);
+ if ( !r.haveCursor() ) {
+ if (fails++ > 30) {
+ log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl;
+ return false;
+ }
+ }
+
+ continue;
+ }
+
+ BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
+ ts = o["ts"]._opTime();
+
+ {
+ if( (source->state() != MemberState::RS_PRIMARY &&
+ source->state() != MemberState::RS_SECONDARY) ||
+ replSetForceInitialSyncFailure ) {
+
+ int f = replSetForceInitialSyncFailure;
+ if( f > 0 ) {
+ replSetForceInitialSyncFailure = f-1;
+ log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
+ throw DBException("forced error",0);
+ }
+ log() << "replSet we are now primary" << rsLog;
+ throw DBException("primary changed",0);
+ }
+
+ applyOp(o, applyGTE);
+ }
+
+ if ( ++n % 1000 == 0 ) {
+ time_t now = time(0);
+ if (now - start > 10) {
+ // simple progress metering
+ log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
+ << ts.toStringPretty() << rsLog;
+ start = now;
+ }
+ }
+
+ getDur().commitIfNeeded();
+ }
+ catch (DBException& e) {
+ // Skip duplicate key exceptions.
+ // These are relatively common on initial sync: if a document is inserted
+ // early in the clone step, the insert will be replayed but the document
+ // will probably already have been cloned over.
+ if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) {
+ continue;
+ }
+
+ // handle cursor not found (just requery)
+ if( e.getCode() == 13127 ) {
+ log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
+ r.resetCursor();
+ r.tailingQueryGTE(rsoplog, ts);
+ if( r.haveCursor() ) {
+ continue;
+ }
+ }
+
+ // TODO: handle server restart
+
+ if( ts <= minValid ) {
+ // didn't make it far enough
+ log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
+ return false;
+ }
+
+ // otherwise, whatever, we'll break out of the loop and catch
+ // anything that's really wrong in syncTail
+ }
+ }
+ return true;
+ }
+
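+    // during initial sync an oplog update may reference a document that was not
+    // yet present when its collection was cloned; per the assertion message
+    // below, shouldRetry() adds the missing object (fetched from the sync
+    // source) so the second syncApply() can succeed.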
+ void replset::InitialSync::applyOp(const BSONObj& o, const OpTime& applyGTE) {
+ OpTime ts = o["ts"]._opTime();
+
+ // optimes before we started copying need not be applied.
+ if( ts >= applyGTE ) {
+ if (!syncApply(o)) {
+ if (shouldRetry(o)) {
+ uassert(15915, "replSet update still fails after adding missing object", syncApply(o));
+ }
+ }
+ }
+
+ // with repl sets we write the ops to our oplog, too
+ _logOpObjRS(o);
+ }
+
+ /* should be in RECOVERING state on arrival here.
+ readlocks
+ @return true if transitioned to SECONDARY
+ */
+ bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
+ bool golive = false;
+
+ {
+ lock lk( this );
+
+ if (_maintenanceMode > 0) {
+ // we're not actually going live
+ return true;
+ }
+ }
+
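+        // we may go live only once we have replayed past minvalid, the point
+        // noted at the end of initial sync/rollback: e.g. minvalid at ts(5000)
+        // and lastOpTimeWritten at ts(5001) -> transition to SECONDARY below.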
+ {
+ readlock lk("local.replset.minvalid");
+ BSONObj mv;
+ if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
+ minvalid = mv["ts"]._opTime();
+ if( minvalid <= lastOpTimeWritten ) {
+ golive=true;
+ }
+ }
+ else
+ golive = true; /* must have been the original member */
+ }
+ if( golive ) {
+ sethbmsg("");
+ changeState(MemberState::RS_SECONDARY);
+ }
+ return golive;
+ }
+
+ bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) {
+ remoteOldestOp = r.findOne(rsoplog, Query());
+ OpTime remoteTs = remoteOldestOp["ts"]._opTime();
+ DEV log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
+ else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
+ DEV {
+ log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet our state: " << state().toString() << rsLog;
+ }
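+        // stale means the remote's oplog no longer reaches back to our position:
+        // e.g. if startTs is ts(100) but the remote's oldest entry is ts(150),
+        // tailing from this member could never replay ops 100..150.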
+ if( startTs >= remoteTs ) {
+ return false;
+ }
+
+ return true;
+ }
+
+ Member* ReplSetImpl::_getOplogReader(OplogReader& r, const OpTime& minTS) {
+ Member *target = 0, *stale = 0;
+ BSONObj oldest;
+
+ assert(r.conn() == 0);
+
+ while ((target = getMemberToSyncTo()) != 0) {
+ string current = target->fullName();
+
+ if( !r.connect(current) ) {
+ log(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
+ r.resetConnection();
+ veto(current);
+ continue;
+ }
+
+ if( !minTS.isNull() && _isStale(r, minTS, oldest) ) {
+ r.resetConnection();
+ veto(current, 600);
+ stale = target;
+ continue;
+ }
+
+ // if we made it here, the target is up and not stale
+ return target;
+ }
+
+ // the only viable sync target was stale
+ if (stale) {
+ log() << "replSet error RS102 too stale to catch up, at least from " << stale->fullName() << rsLog;
+ log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet oldest at " << stale->fullName() << " : " << oldest["ts"]._opTime().toStringLong() << rsLog;
+ log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
+
+ // reset minvalid so that we can't become primary prematurely
+ {
+ writelock lk("local.replset.minvalid");
+ Helpers::putSingleton("local.replset.minvalid", oldest);
+ }
+
+ sethbmsg("error RS102 too stale to catch up");
+ changeState(MemberState::RS_RECOVERING);
+ sleepsecs(120);
+ }
+
+ return 0;
+ }
+
+ /* tail an oplog. ok to return, will be re-called. */
+ void ReplSetImpl::syncTail() {
+ // todo : locking vis a vis the mgr...
+ OplogReader r;
+ string hn;
+
+ // find a target to sync from the last op time written
+ Member* target = _getOplogReader(r, lastOpTimeWritten);
+
+ // no server found
+ if (target == 0) {
+ // if there is no one to sync from
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
+ return;
+ }
+
+ r.tailingQueryGTE(rsoplog, lastOpTimeWritten);
+ // if target cut connections between connecting and querying (for
+ // example, because it stepped down) we might not have a cursor
+ if ( !r.haveCursor() ) {
+ return;
+ }
+
+ uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );
+
+ {
+ if( !r.more() ) {
+ /* maybe we are ahead and need to roll back? */
+ try {
+ bo theirLastOp = r.getLastOp(rsoplog);
+ if( theirLastOp.isEmpty() ) {
+ log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
+ sleepsecs(2);
+ return;
+ }
+ OpTime theirTS = theirLastOp["ts"]._opTime();
+ if( theirTS < lastOpTimeWritten ) {
+ log() << "replSet we are ahead of the primary, will try to roll back" << rsLog;
+ syncRollback(r);
+ return;
+ }
+ /* we're not ahead? maybe our new query got fresher data. best to come back and try again */
+ log() << "replSet syncTail condition 1" << rsLog;
+ sleepsecs(1);
+ }
+ catch(DBException& e) {
+ log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
+ veto(target->fullName());
+ sleepsecs(2);
+ }
+ return;
+ }
+
+ BSONObj o = r.nextSafe();
+ OpTime ts = o["ts"]._opTime();
+ long long h = o["h"].numberLong();
+ if( ts != lastOpTimeWritten || h != lastH ) {
+ log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << rsLog;
+ log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
+ syncRollback(r);
+ return;
+ }
+ }
+
+ /* we have now checked if we need to rollback and we either don't have to or did it. */
+ {
+ OpTime minvalid;
+ tryToGoLiveAsASecondary(minvalid);
+ }
+
+ while( 1 ) {
+ {
+ Timer timeInWriteLock;
+ writelock lk("");
+ while( 1 ) {
+ if( !r.moreInCurrentBatch() ) {
+ dbtemprelease tempRelease;
+ {
+ // we need to occasionally check some things. between
+ // batches is probably a good time.
+ if( state().recovering() ) { // perhaps we should check this earlier? but not before the rollback checks.
+ /* can we go to RS_SECONDARY state? we can if not too old and if minvalid achieved */
+ OpTime minvalid;
+ bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid);
+                                    if( !golive ) {
+                                        sethbmsg(str::stream() << "still syncing, not yet to minValid optime " << minvalid.toString());
+                                    }
+ // todo: too stale capability
+ }
+ if( !target->hbinfo().hbstate.readable() ) {
+ return;
+ }
+ }
+ r.more(); // to make the requestmore outside the db lock, which obviously is quite important
+ }
+ if( timeInWriteLock.micros() > 1000 ) {
+ dbtemprelease tempRelease;
+ timeInWriteLock.reset();
+ }
+ if( !r.more() )
+ break;
+ {
+ BSONObj o = r.nextSafe(); // note we might get "not master" at some point
+
+ int sd = myConfig().slaveDelay;
+ // ignore slaveDelay if the box is still initializing. once
+ // it becomes secondary we can worry about it.
+ if( sd && box.getState().secondary() ) {
+ const OpTime ts = o["ts"]._opTime();
+ long long a = ts.getSecs();
+ long long b = time(0);
+ long long lag = b - a;
+ long long sleeptime = sd - lag;
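+                            // e.g. slaveDelay = 3600 and the op is 600s old:
+                            // lag = 600, so we sleep about 3000s before applying.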
+ if( sleeptime > 0 ) {
+ dbtemprelease tempRelease;
+ uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
+ if( sleeptime < 60 ) {
+ sleepsecs((int) sleeptime);
+ }
+ else {
+ log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
+ // sleep(hours) would prevent reconfigs from taking effect & such!
+ long long waitUntil = b + sleeptime;
+ while( 1 ) {
+ sleepsecs(6);
+ if( time(0) >= waitUntil )
+ break;
+
+ if( !target->hbinfo().hbstate.readable() ) {
+ break;
+ }
+
+ if( myConfig().slaveDelay != sd ) // reconf
+ break;
+ }
+ }
+ }
+ } // endif slaveDelay
+
+ d.dbMutex.assertWriteLocked();
+ try {
+                        /* if we have become primary, we don't want to apply things from elsewhere
+ anymore. assumePrimary is in the db lock so we are safe as long as
+ we check after we locked above. */
+ if( box.getState().primary() ) {
+ log(0) << "replSet stopping syncTail we are now primary" << rsLog;
+ return;
+ }
+
+ // TODO: make this whole method a member of SyncTail (SERVER-4444)
+ replset::SyncTail tail("");
+ tail.syncApply(o);
+ _logOpObjRS(o); // with repl sets we write the ops to our oplog too
+ }
+ catch (DBException& e) {
+ sethbmsg(str::stream() << "syncTail: " << e.toString() << ", syncing: " << o);
+ veto(target->fullName(), 300);
+ sleepsecs(30);
+ return;
+ }
+ }
+ } // end while
+ } // end writelock scope
+
+ r.tailCheck();
+ if( !r.haveCursor() ) {
+ LOG(1) << "replSet end syncTail pass with " << hn << rsLog;
+ // TODO : reuse our connection to the primary.
+ return;
+ }
+
+ if( !target->hbinfo().hbstate.readable() ) {
+ return;
+ }
+ // looping back is ok because this is a tailable cursor
+ }
+ }
+
+ void ReplSetImpl::_syncThread() {
+ StateBox::SP sp = box.get();
+ if( sp.state.primary() ) {
+ sleepsecs(1);
+ return;
+ }
+ if( _blockSync || sp.state.fatal() || sp.state.startup() ) {
+ sleepsecs(5);
+ return;
+ }
+
+ /* do we have anything at all? */
+ if( lastOpTimeWritten.isNull() ) {
+ syncDoInitialSync();
+ return; // _syncThread will be recalled, starts from top again in case sync failed.
+ }
+
+ /* we have some data. continue tailing. */
+ syncTail();
+ }
+
+ void ReplSetImpl::syncThread() {
+ while( 1 ) {
+ // After a reconfig, we may not be in the replica set anymore, so
+ // check that we are in the set (and not an arbiter) before
+ // trying to sync with other replicas.
+ if( ! _self ) {
+ log() << "replSet warning did not detect own host and port, not syncing, config: " << theReplSet->config() << rsLog;
+ return;
+ }
+ if( myConfig().arbiterOnly ) {
+ return;
+ }
+
+ try {
+ _syncThread();
+ }
+ catch(DBException& e) {
+ sethbmsg(str::stream() << "syncThread: " << e.toString());
+ sleepsecs(10);
+ }
+ catch(...) {
+ sethbmsg("unexpected exception in syncThread()");
+ // TODO : SET NOT SECONDARY here?
+ sleepsecs(60);
+ }
+ sleepsecs(1);
+
+ /* normally msgCheckNewState gets called periodically, but in a single node repl set there
+ are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton
+ member has done a stepDown() and needs to come back up.
+ */
+ OCCASIONALLY {
+ mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+ }
+ }
+ }
+
+ void startSyncThread() {
+ static int n;
+ if( n != 0 ) {
+ log() << "replSet ERROR : more than one sync thread?" << rsLog;
+ assert( n == 0 );
+ }
+ n++;
+
+ Client::initThread("rsSync");
+        cc().iAmSyncThread(); // for isSyncThread() (not used much; used in the secondary create-index code)
+ replLocalAuth();
+ theReplSet->syncThread();
+ cc().shutdown();
+ }
+
+ void GhostSync::starting() {
+ Client::initThread("rsGhostSync");
+ replLocalAuth();
+ }
+
+ void ReplSetImpl::blockSync(bool block) {
+ _blockSync = block;
+ if (_blockSync) {
+ // syncing is how we get into SECONDARY state, so we'll be stuck in
+ // RECOVERING until we unblock
+ changeState(MemberState::RS_RECOVERING);
+ }
+ }
+
+ void GhostSync::associateSlave(const BSONObj& id, const int memberId) {
+ const OID rid = id["_id"].OID();
+ rwlock lk( _lock , true );
+ shared_ptr<GhostSlave> &g = _ghostCache[rid];
+ if( g.get() == 0 ) {
+ g.reset( new GhostSlave() );
+ wassert( _ghostCache.size() < 10000 );
+ }
+ GhostSlave &slave = *g;
+ if (slave.init) {
+ LOG(1) << "tracking " << slave.slave->h().toString() << " as " << rid << rsLog;
+ return;
+ }
+
+ slave.slave = (Member*)rs->findById(memberId);
+ if (slave.slave != 0) {
+ slave.init = true;
+ }
+ else {
+ log() << "replset couldn't find a slave with id " << memberId
+ << ", not tracking " << rid << rsLog;
+ }
+ }
+
+ void GhostSync::updateSlave(const mongo::OID& rid, const OpTime& last) {
+ rwlock lk( _lock , false );
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY warning() << "couldn't update slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ GhostSlave& slave = *(i->second);
+ if (!slave.init) {
+ OCCASIONALLY log() << "couldn't update slave " << rid << " not init" << rsLog;
+ return;
+ }
+
+ ((ReplSetConfig::MemberCfg)slave.slave->config()).updateGroups(last);
+ }
+
+ void GhostSync::percolate(const BSONObj& id, const OpTime& last) {
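+        // on behalf of chained slave 'rid', tail our own sync target's oplog up
+        // to 'last' so the target sees how far that slave has replicated -
+        // presumably for write-concern accounting across chained replication.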
+ const OID rid = id["_id"].OID();
+ GhostSlave* slave;
+ {
+ rwlock lk( _lock , false );
+
+ MAP::iterator i = _ghostCache.find( rid );
+ if ( i == _ghostCache.end() ) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " no entry" << rsLog;
+ return;
+ }
+
+ slave = i->second.get();
+ if (!slave->init) {
+ OCCASIONALLY log() << "couldn't percolate slave " << rid << " not init" << rsLog;
+ return;
+ }
+ }
+
+ assert(slave->slave);
+
+ const Member *target = rs->_currentSyncTarget;
+ if (!target || rs->box.getState().primary()
+ // we are currently syncing from someone who's syncing from us
+ // the target might end up with a new Member, but s.slave never
+ // changes so we'll compare the names
+ || target == slave->slave || target->fullName() == slave->slave->fullName()) {
+ LOG(1) << "replica set ghost target no good" << endl;
+ return;
+ }
+
+ try {
+ if (!slave->reader.haveCursor()) {
+ if (!slave->reader.connect(id, slave->slave->id(), target->fullName())) {
+ // error message logged in OplogReader::connect
+ return;
+ }
+ slave->reader.ghostQueryGTE(rsoplog, last);
+ }
+
+ LOG(1) << "replSet last: " << slave->last.toString() << " to " << last.toString() << rsLog;
+ if (slave->last > last) {
+ return;
+ }
+
+ while (slave->last <= last) {
+ if (!slave->reader.more()) {
+ // we'll be back
+ return;
+ }
+
+ BSONObj o = slave->reader.nextSafe();
+ slave->last = o["ts"]._opTime();
+ }
+ LOG(2) << "now last is " << slave->last.toString() << rsLog;
+ }
+ catch (DBException& e) {
+ // we'll be back
+ LOG(2) << "replSet ghost sync error: " << e.what() << " for "
+ << slave->slave->fullName() << rsLog;
+ slave->reader.resetConnection();
+ }
+ }
+}
diff --git a/src/mongo/db/repl/test.html b/src/mongo/db/repl/test.html
new file mode 100644
index 00000000000..295ad2ef0e0
--- /dev/null
+++ b/src/mongo/db/repl/test.html
@@ -0,0 +1,11 @@
+<HTML>
+<BODY>
+<!-- see also jstests/rs/ -->
+<iframe src="http://127.0.0.1:28000/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+<iframe src="http://127.0.0.1:28001/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+</BODY>
+</HTML>
diff --git a/src/mongo/db/repl/testing.js b/src/mongo/db/repl/testing.js
new file mode 100644
index 00000000000..d741cf3a644
--- /dev/null
+++ b/src/mongo/db/repl/testing.js
@@ -0,0 +1,42 @@
+// helpers for testing repl sets
+// run
+// mongo --shell <host:port> testing.js
+
+cfg = {
+ _id: 'asdf',
+ members: [
+ { _id : 0, host : "dm_hp" },
+ { _id : 2, host : "dm_hp:27002" }
+ ]
+};
+c2 = {
+ _id: 'asdf',
+ members: [
+ { _id: 0, host: "dmthink" },
+ { _id: 2, host: "dmthink:27002" }
+ ]
+};
+
+db = db.getSisterDB("admin");
+local = db.getSisterDB("local");
+
+print("\n\ndb = admin db on localhost:27017");
+print("b = admin on localhost:27002");
+print("rc(x) = db.runCommand(x)");
+print("cfg = samp replset config");
+print("i() = replSetInitiate(cfg)");
+print("ism() = rc('ismaster')");
+print("\n\n");
+
+function rc(c) { return db.runCommand(c); }
+function i() { return rc({ replSetInitiate: cfg }); }
+function ism() { return rc("isMaster"); }
+
+b = 0;
+try {
+ b = new Mongo("localhost:27002").getDB("admin");
+}
+catch (e) {
+ print("\nCouldn't connect to b mongod instance\n");
+}
+
diff --git a/src/mongo/db/repl_block.cpp b/src/mongo/db/repl_block.cpp
new file mode 100644
index 00000000000..1776225505c
--- /dev/null
+++ b/src/mongo/db/repl_block.cpp
@@ -0,0 +1,256 @@
+// repl_block.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "instance.h"
+#include "dbhelpers.h"
+#include "../util/background.h"
+#include "../util/mongoutils/str.h"
+#include "../client/dbclient.h"
+#include "replutil.h"
+
+//#define REPLDEBUG(x) log() << "replBlock: " << x << endl;
+#define REPLDEBUG(x)
+
+namespace mongo {
+
+ using namespace mongoutils;
+
+ class SlaveTracking : public BackgroundJob {
+ public:
+ string name() const { return "SlaveTracking"; }
+
+ static const char * NS;
+
+ struct Ident {
+
+ Ident(const BSONObj& r, const string& h, const string& n) {
+ BSONObjBuilder b;
+ b.appendElements( r );
+ b.append( "host" , h );
+ b.append( "ns" , n );
+ obj = b.obj();
+ }
+
+ bool operator<( const Ident& other ) const {
+ return obj["_id"].OID() < other.obj["_id"].OID();
+ }
+
+ BSONObj obj;
+ };
+
+ struct Info {
+ Info() : owned(false), loc(0) {}
+ ~Info() {
+ if ( loc && owned ) {
+ delete loc;
+ }
+ }
+ bool owned; // true if loc is a pointer of our creation (and not a pointer into a MMF)
+ OpTime * loc;
+ };
+
+ SlaveTracking() : _mutex("SlaveTracking") {
+ _dirty = false;
+ _started = false;
+ }
+
+ void run() {
+ Client::initThread( "slaveTracking" );
+ DBDirectClient db;
+ while ( ! inShutdown() ) {
+ sleepsecs( 1 );
+
+ if ( ! _dirty )
+ continue;
+
+ writelock lk(NS);
+
+ list< pair<BSONObj,BSONObj> > todo;
+
+ {
+ scoped_lock mylk(_mutex);
+
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ) {
+ BSONObjBuilder temp;
+ temp.appendTimestamp( "syncedTo" , i->second.loc[0].asDate() );
+ todo.push_back( pair<BSONObj,BSONObj>( i->first.obj.getOwned() ,
+ BSON( "$set" << temp.obj() ).getOwned() ) );
+ }
+ }
+
+ for ( list< pair<BSONObj,BSONObj> >::iterator i=todo.begin(); i!=todo.end(); i++ ) {
+ db.update( NS , i->first , i->second , true );
+ }
+
+ _dirty = false;
+ }
+ }
+
+ void reset() {
+ scoped_lock mylk(_mutex);
+ _slaves.clear();
+ }
+
+ void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ) {
+ REPLDEBUG( host << " " << rid << " " << ns << " " << last );
+
+ scoped_lock mylk(_mutex);
+
+#ifdef _DEBUG
+ MongoFileAllowWrites allowWrites;
+#endif
+
+ Ident ident(rid,host,ns);
+ Info& i = _slaves[ ident ];
+
+ if (theReplSet && theReplSet->isPrimary()) {
+ theReplSet->ghost->updateSlave(ident.obj["_id"].OID(), last);
+ }
+
+ if ( i.loc ) {
+ if( i.owned )
+ i.loc[0] = last;
+ else
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
+ return;
+ }
+
+ d.dbMutex.assertAtLeastReadLocked();
+
+ BSONObj res;
+ if ( Helpers::findOne( NS , ident.obj , res ) ) {
+ assert( res["syncedTo"].type() );
+ i.owned = false;
+ i.loc = (OpTime*)res["syncedTo"].value();
+ getDur().setNoJournal(i.loc, &last, sizeof(last));
+ return;
+ }
+
+ i.owned = true;
+ i.loc = new OpTime(last);
+ _dirty = true;
+
+ if ( ! _started ) {
+ // start background thread here since we definitely need it
+ _started = true;
+ go();
+ }
+
+ }
+
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
+ RARELY {
+ REPLDEBUG( "looking for : " << op << " w=" << w );
+ }
+
+ if (w.isNumber()) {
+ return replicatedToNum(op, w.numberInt());
+ }
+
+ if (!theReplSet) {
+ return false;
+ }
+
+ string wStr = w.String();
+ if (wStr == "majority") {
+ // use the entire set, including arbiters, to prevent writing
+ // to a majority of the set but not a majority of voters
+ return replicatedToNum(op, theReplSet->config().getMajority());
+ }
+
+ map<string,ReplSetConfig::TagRule*>::const_iterator it = theReplSet->config().rules.find(wStr);
+ uassert(14830, str::stream() << "unrecognized getLastError mode: " << wStr,
+ it != theReplSet->config().rules.end());
+
+ return op <= (*it).second->last;
+ }
+
+ bool replicatedToNum(OpTime& op, int w) {
+ if ( w <= 1 || ! _isMaster() )
+ return true;
+
+ w--; // now this is the # of slaves i need
+ scoped_lock mylk(_mutex);
+ for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++) {
+ OpTime s = *(i->second.loc);
+ if ( s < op ) {
+ continue;
+ }
+ if ( --w == 0 )
+ return true;
+ }
+ return w <= 0;
+ }
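+
+ // e.g. (illustrative) replicatedToNum(op, 3): after w--, two slaves synced to >= op
+ // satisfy the check; the primary itself counts as the first of the three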
+
+ unsigned getSlaveCount() const {
+ scoped_lock mylk(_mutex);
+
+ return _slaves.size();
+ }
+
+ // need to be careful not to deadlock with this
+ mutable mongo::mutex _mutex;
+ map<Ident,Info> _slaves;
+ bool _dirty;
+ bool _started;
+
+ } slaveTracking;
+
+ const char * SlaveTracking::NS = "local.slaves";
+
+ void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ) {
+ if ( lastOp.isNull() )
+ return;
+
+ assert( str::startsWith(ns, "local.oplog.") );
+
+ Client * c = curop.getClient();
+ assert(c);
+ BSONObj rid = c->getRemoteID();
+ if ( rid.isEmpty() )
+ return;
+
+ slaveTracking.update( rid , curop.getRemoteString( false ) , ns , lastOp );
+
+ if (theReplSet && !theReplSet->isPrimary()) {
+ // we don't know the slave's port, so we make the replica set keep
+ // a map of rids to slaves
+ log(2) << "percolating " << lastOp.toString() << " from " << rid << endl;
+ theReplSet->ghost->send( boost::bind(&GhostSync::percolate, theReplSet->ghost, rid, lastOp) );
+ }
+ }
+
+ bool opReplicatedEnough( OpTime op , BSONElement w ) {
+ return slaveTracking.opReplicatedEnough( op , w );
+ }
+
+ bool opReplicatedEnough( OpTime op , int w ) {
+ return slaveTracking.replicatedToNum( op , w );
+ }
+
+ void resetSlaveCache() {
+ slaveTracking.reset();
+ }
+
+ unsigned getSlaveCount() {
+ return slaveTracking.getSlaveCount();
+ }
+}
diff --git a/src/mongo/db/repl_block.h b/src/mongo/db/repl_block.h
new file mode 100644
index 00000000000..bb74deea10f
--- /dev/null
+++ b/src/mongo/db/repl_block.h
@@ -0,0 +1,39 @@
+// repl_block.h - blocking on writes for replication
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "client.h"
+#include "curop.h"
+
+/**
+ local.slaves - current location for all slaves
+
+ */
+namespace mongo {
+
+ void updateSlaveLocation( CurOp& curop, const char * oplog_ns , OpTime lastOp );
+
+ /** @return true if op has made it to w servers */
+ bool opReplicatedEnough( OpTime op , int w );
+ bool opReplicatedEnough( OpTime op , BSONElement w );
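+
+ /* usage sketch (illustrative): after a write whose oplog entry has OpTime 'op',
+ a getLastError with {w: 2} is satisfied once opReplicatedEnough(op, 2) returns
+ true, i.e. the primary plus one slave have the op. */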
+
+ void resetSlaveCache();
+ unsigned getSlaveCount();
+}
diff --git a/src/mongo/db/replutil.h b/src/mongo/db/replutil.h
new file mode 100644
index 00000000000..6f4dbb875d2
--- /dev/null
+++ b/src/mongo/db/replutil.h
@@ -0,0 +1,102 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "db.h"
+#include "dbhelpers.h"
+#include "json.h"
+#include "../client/dbclient.h"
+#include "repl.h"
+#include "cmdline.h"
+#include "repl/rs.h"
+#include "ops/query.h"
+
+namespace mongo {
+
+ extern const char *replAllDead;
+
+ /* note we always return true for the "local" namespace.
+
+ we should not allow most operations when not the master
+ also we report not master if we are "dead".
+
+ See also CmdIsMaster.
+ */
+ inline bool _isMaster() {
+ if( replSet ) {
+ if( theReplSet )
+ return theReplSet->isPrimary();
+ return false;
+ }
+
+ if( ! replSettings.slave )
+ return true;
+
+ if ( replAllDead )
+ return false;
+
+ if( replSettings.master ) {
+ // if running with --master --slave, allow.
+ return true;
+ }
+
+ if ( cc().isGod() )
+ return true;
+
+ return false;
+ }
+ inline bool isMaster(const char * dbname = 0) {
+ if( _isMaster() )
+ return true;
+ if ( ! dbname ) {
+ Database *database = cc().database();
+ assert( database );
+ dbname = database->name.c_str();
+ }
+ return strcmp( dbname , "local" ) == 0;
+ }
+ inline bool isMasterNs( const char *ns ) {
+ if ( _isMaster() )
+ return true;
+ assert( ns );
+ if ( ! str::startsWith( ns , "local" ) )
+ return false;
+ return ns[5] == 0 || ns[5] == '.';
+ }
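+
+ // e.g. isMasterNs("local.oplog.rs") is true on any member, while
+ // isMasterNs("test.foo") is true only when this node is master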
+
+ inline void notMasterUnless(bool expr) {
+ uassert( 10107 , "not master" , expr );
+ }
+
+ /** we allow queries to SimpleSlave's */
+ inline void replVerifyReadsOk(ParsedQuery& pq) {
+ if( replSet ) {
+ /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */
+ if( isMaster() ) return;
+ uassert(13435, "not master and slaveOk=false", pq.hasOption(QueryOption_SlaveOk));
+ uassert(13436, "not master or secondary; cannot currently read from this replSet member", theReplSet && theReplSet->isSecondary() );
+ }
+ else {
+ notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave );
+ }
+ }
+
+
+
+} // namespace mongo
diff --git a/src/mongo/db/resource.h b/src/mongo/db/resource.h
new file mode 100644
index 00000000000..9ba1ed26a0c
--- /dev/null
+++ b/src/mongo/db/resource.h
@@ -0,0 +1,16 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by db.rc
+//
+#define IDI_ICON2 102
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 104
+#define _APS_NEXT_COMMAND_VALUE 40001
+#define _APS_NEXT_CONTROL_VALUE 1001
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
diff --git a/src/mongo/db/restapi.cpp b/src/mongo/db/restapi.cpp
new file mode 100644
index 00000000000..370051354a2
--- /dev/null
+++ b/src/mongo/db/restapi.cpp
@@ -0,0 +1,294 @@
+/** @file restapi.cpp
+ web rest api
+*/
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../util/net/miniwebserver.h"
+#include "../util/mongoutils/html.h"
+#include "../util/md5.hpp"
+#include "instance.h"
+#include "dbwebserver.h"
+#include "dbhelpers.h"
+#include "repl.h"
+#include "replutil.h"
+#include "clientcursor.h"
+#include "background.h"
+
+#include "restapi.h"
+
+namespace mongo {
+
+ extern const char *replInfo;
+ bool getInitialSyncCompleted();
+
+ using namespace bson;
+ using namespace mongoutils::html;
+
+ class RESTHandler : public DbWebHandler {
+ public:
+ RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ) {}
+
+ virtual bool handles( const string& url ) const {
+ return
+ url[0] == '/' &&
+ url.find_last_of( '/' ) > 0;
+ }
+
+ virtual void handle( const char *rq, string url, BSONObj params,
+ string& responseMsg, int& responseCode,
+ vector<string>& headers, const SockAddr &from ) {
+
+ string::size_type first = url.find( "/" , 1 );
+ if ( first == string::npos ) {
+ responseCode = 400;
+ return;
+ }
+
+ string method = MiniWebServer::parseMethod( rq );
+ string dbname = url.substr( 1 , first - 1 );
+ string coll = url.substr( first + 1 );
+ string action = "";
+
+ string::size_type last = coll.find_last_of( "/" );
+ if ( last == string::npos ) {
+ action = coll;
+ coll = "_defaultCollection";
+ }
+ else {
+ action = coll.substr( last + 1 );
+ coll = coll.substr( 0 , last );
+ }
+
+ for ( string::size_type i=0; i<coll.size(); i++ )
+ if ( coll[i] == '/' )
+ coll[i] = '.';
+
+ string fullns = MiniWebServer::urlDecode(dbname + "." + coll);
+
+ headers.push_back( (string)"x-action: " + action );
+ headers.push_back( (string)"x-ns: " + fullns );
+
+ bool html = false;
+
+ stringstream ss;
+
+ if ( method == "GET" ) {
+ responseCode = 200;
+ html = handleRESTQuery( fullns , action , params , responseCode , ss );
+ }
+ else if ( method == "POST" ) {
+ responseCode = 201;
+ handlePost( fullns , MiniWebServer::body( rq ) , params , responseCode , ss );
+ }
+ else {
+ responseCode = 400;
+ headers.push_back( "X_err: bad request" );
+ ss << "don't know how to handle a [" << method << "]";
+ out() << "don't know how to handle a [" << method << "]" << endl;
+ }
+
+ if( html )
+ headers.push_back("Content-Type: text/html;charset=utf-8");
+ else
+ headers.push_back("Content-Type: text/plain;charset=utf-8");
+
+ responseMsg = ss.str();
+ }
+
+ bool handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) {
+ Timer t;
+
+ int html = _getOption( params["html"] , 0 );
+ int skip = _getOption( params["skip"] , 0 );
+ int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new
+
+ int one = 0;
+ if ( params["one"].type() == String && tolower( params["one"].valuestr()[0] ) == 't' ) {
+ num = 1;
+ one = 1;
+ }
+
+ BSONObjBuilder queryBuilder;
+
+ BSONObjIterator i(params);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ string name = e.fieldName();
+ if ( name.find( "filter_" ) != 0 )
+ continue;
+
+ string field = name.substr(7);
+ const char * val = e.valuestr();
+
+ char * temp;
+
+ // TODO: this is how i guess if something is a number. pretty lame right now
+ double number = strtod( val , &temp );
+ if ( temp != val )
+ queryBuilder.append( field , number );
+ else
+ queryBuilder.append( field , val );
+ }
+
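+ // illustrative (hypothetical request): GET /foo/bar/_find?filter_x=1&filter_name=abc
+ // builds the query { x: 1.0, name: "abc" } against namespace "foo.bar"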
+ BSONObj query = queryBuilder.obj();
+ auto_ptr<DBClientCursor> cursor = db.query( ns.c_str() , query, num , skip );
+ uassert( 13085 , "query failed for dbwebserver" , cursor.get() );
+
+ if ( one ) {
+ if ( cursor->more() ) {
+ BSONObj obj = cursor->next();
+ out << obj.jsonString(Strict,html?1:0) << '\n';
+ }
+ else {
+ responseCode = 404;
+ }
+ return html != 0;
+ }
+
+ if( html ) {
+ string title = string("query ") + ns;
+ out << start(title)
+ << p(title)
+ << "<pre>";
+ }
+ else {
+ out << "{\n";
+ out << " \"offset\" : " << skip << ",\n";
+ out << " \"rows\": [\n";
+ }
+
+ int howMany = 0;
+ while ( cursor->more() ) {
+ if ( howMany++ && html == 0 )
+ out << " ,\n";
+ BSONObj obj = cursor->next();
+ if( html ) {
+ if( out.tellp() > 4 * 1024 * 1024 ) {
+ out << "Stopping output: more than 4MB returned and in html mode\n";
+ break;
+ }
+ out << obj.jsonString(Strict, html?1:0) << "\n\n";
+ }
+ else {
+ if( out.tellp() > 50 * 1024 * 1024 ) // 50MB limit - we are using ram
+ break;
+ out << " " << obj.jsonString();
+ }
+ }
+
+ if( html ) {
+ out << "</pre>\n";
+ if( howMany == 0 ) out << p("Collection is empty");
+ out << _end();
+ }
+ else {
+ out << "\n ],\n\n";
+ out << " \"total_rows\" : " << howMany << " ,\n";
+ out << " \"query\" : " << query.jsonString() << " ,\n";
+ out << " \"millis\" : " << t.millis() << '\n';
+ out << "}\n";
+ }
+
+ return html != 0;
+ }
+
+ // TODO Generate id and revision per couch POST spec
+ void handlePost( string ns, const char *body, BSONObj& params, int & responseCode, stringstream & out ) {
+ try {
+ BSONObj obj = fromjson( body );
+ db.insert( ns.c_str(), obj );
+ }
+ catch ( ... ) {
+ responseCode = 400; // Bad Request. Seems reasonable for now.
+ out << "{ \"ok\" : false }";
+ return;
+ }
+
+ responseCode = 201;
+ out << "{ \"ok\" : true }";
+ }
+
+ int _getOption( BSONElement e , int def ) {
+ if ( e.isNumber() )
+ return e.numberInt();
+ if ( e.type() == String )
+ return atoi( e.valuestr() );
+ return def;
+ }
+
+ DBDirectClient db;
+
+ } restHandler;
+
+ bool RestAdminAccess::haveAdminUsers() const {
+ readlocktryassert rl("admin.system.users", 10000);
+ Client::Context cx( "admin.system.users", dbpath, false );
+ return ! Helpers::isEmpty("admin.system.users", false);
+ }
+
+ BSONObj RestAdminAccess::getAdminUser( const string& username ) const {
+ Client::GodScope gs;
+ readlocktryassert rl("admin.system.users", 10000);
+ Client::Context cx( "admin.system.users" );
+ BSONObj user;
+ if ( Helpers::findOne( "admin.system.users" , BSON( "user" << username ) , user ) )
+ return user.copy();
+ return BSONObj();
+ }
+
+ class LowLevelMongodStatus : public WebStatusPlugin {
+ public:
+ LowLevelMongodStatus() : WebStatusPlugin( "overview" , 5 , "(only reported if can acquire read lock quickly)" ) {}
+
+ virtual void init() {}
+
+ void _gotLock( int millis , stringstream& ss ) {
+ ss << "<pre>\n";
+ ss << "time to get readlock: " << millis << "ms\n";
+ ss << "# databases: " << dbHolder().sizeInfo() << '\n';
+ ss << "# Cursors: " << ClientCursor::numCursors() << '\n';
+ ss << "replication: ";
+ if( *replInfo )
+ ss << "\nreplInfo: " << replInfo << "\n\n";
+ if( replSet ) {
+ ss << a("", "see replSetGetStatus link top of page") << "--replSet </a>" << cmdLine._replSet;
+ }
+ if ( replAllDead )
+ ss << "\n<b>replication replAllDead=" << replAllDead << "</b>\n";
+ else {
+ ss << "\nmaster: " << replSettings.master << '\n';
+ ss << "slave: " << replSettings.slave << '\n';
+ ss << '\n';
+ }
+
+ BackgroundOperation::dump(ss);
+ ss << "</pre>\n";
+ }
+
+ virtual void run( stringstream& ss ) {
+ Timer t;
+ readlocktry lk( "" , 300 );
+ if ( lk.got() ) {
+ _gotLock( t.millis() , ss );
+ }
+ else {
+ ss << "\n<b>timed out getting lock</b>\n";
+ }
+ }
+ } lowLevelMongodStatus;
+}
diff --git a/src/mongo/db/restapi.h b/src/mongo/db/restapi.h
new file mode 100644
index 00000000000..e5ac52083fe
--- /dev/null
+++ b/src/mongo/db/restapi.h
@@ -0,0 +1,34 @@
+/** @file restapi.h
+ */
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../util/admin_access.h"
+
+namespace mongo {
+
+ class RestAdminAccess : public AdminAccess {
+ public:
+ virtual ~RestAdminAccess() { }
+
+ virtual bool haveAdminUsers() const;
+ virtual BSONObj getAdminUser( const string& username ) const;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/scanandorder.cpp b/src/mongo/db/scanandorder.cpp
new file mode 100644
index 00000000000..b5e282a5866
--- /dev/null
+++ b/src/mongo/db/scanandorder.cpp
@@ -0,0 +1,105 @@
+/* scanandorder.cpp
+ Order results (that aren't already indexed and in order).
+*/
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "scanandorder.h"
+
+namespace mongo {
+
+ const unsigned ScanAndOrder::MaxScanAndOrderBytes = 32 * 1024 * 1024;
+
+ void ScanAndOrder::_add(BSONObj& k, BSONObj o, DiskLoc* loc) {
+ if (!loc) {
+ _best.insert(make_pair(k.getOwned(),o.getOwned()));
+ }
+ else {
+ BSONObjBuilder b;
+ b.appendElements(o);
+ b.append("$diskLoc", loc->toBSONObj());
+ _best.insert(make_pair(k.getOwned(), b.obj().getOwned()));
+ }
+ }
+
+ void ScanAndOrder::_addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) {
+ /* todo : we don't correct _approxSize here. */
+ const BSONObj& worstBestKey = i->first;
+ int c = worstBestKey.woCompare(k, _order._spec.keyPattern);
+ if ( c > 0 ) {
+ // k is better, 'upgrade'
+ _best.erase(i);
+ _add(k, o, loc);
+ }
+ }
+
+
+ void ScanAndOrder::add(BSONObj o, DiskLoc* loc) {
+ assert( o.isValid() );
+ BSONObj k;
+ try {
+ k = _order.getKeyFromObject(o);
+ }
+ catch (UserException &e) {
+ if ( e.getCode() == ParallelArraysCode ) { // cannot get keys for parallel arrays
+ // fix lasterror text to be more accurate.
+ uasserted( 15925, "cannot sort with keys that are parallel arrays" );
+ }
+ else
+ throw;
+ }
+
+ if ( k.isEmpty() ) {
+ return;
+ }
+ if ( (int) _best.size() < _limit ) {
+ _approxSize += k.objsize();
+ _approxSize += o.objsize();
+
+ /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */
+ uassert( 10128 , "too much data for sort() with no index. add an index or specify a smaller limit", _approxSize < MaxScanAndOrderBytes );
+
+ _add(k, o, loc);
+ return;
+ }
+ BestMap::iterator i;
+ assert( _best.end() != _best.begin() );
+ i = _best.end();
+ i--;
+ _addIfBetter(k, o, i, loc);
+ }
+
+
+ void ScanAndOrder::fill(BufBuilder& b, Projection *filter, int& nout ) const {
+ int n = 0;
+ int nFilled = 0;
+ for ( BestMap::const_iterator i = _best.begin(); i != _best.end(); i++ ) {
+ n++;
+ if ( n <= _startFrom )
+ continue;
+ const BSONObj& o = i->second;
+ fillQueryResultFromObj(b, filter, o);
+ nFilled++;
+ if ( nFilled >= _limit )
+ break;
+ uassert( 10129 , "too much data for sort() with no index", b.len() < (int)MaxScanAndOrderBytes ); // appserver limit
+ }
+ nout = nFilled;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/scanandorder.h b/src/mongo/db/scanandorder.h
new file mode 100644
index 00000000000..33e76f61f67
--- /dev/null
+++ b/src/mongo/db/scanandorder.h
@@ -0,0 +1,111 @@
+/* scanandorder.h
+ Order results (that aren't already indexed and in order).
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "indexkey.h"
+#include "queryutil.h"
+#include "projection.h"
+
+namespace mongo {
+
+ /* todo:
+ _ limit amount of data
+ */
+
+ class KeyType : boost::noncopyable {
+ public:
+ IndexSpec _spec;
+ FieldRangeVector _keyCutter;
+ public:
+ KeyType(BSONObj pattern, const FieldRangeSet &frs):
+ _spec((assert(!pattern.isEmpty()),pattern)),
+ _keyCutter(frs, _spec, 1) {
+ }
+
+ /**
+ * @return first key of the object that would be encountered while
+ * scanning index with keySpec 'pattern' using constraints 'frs', or
+ * BSONObj() if no such key.
+ */
+ BSONObj getKeyFromObject(BSONObj o) {
+ return _keyCutter.firstMatch(o);
+ }
+ };
+
+ /* todo:
+ _ respect limit
+ _ check for excess mem usage
+ _ response size limit from runquery; push it up a bit.
+ */
+
+ inline void fillQueryResultFromObj(BufBuilder& bb, Projection *filter, const BSONObj& js, DiskLoc* loc=NULL) {
+ if ( filter ) {
+ BSONObjBuilder b( bb );
+ filter->transform( js , b );
+ if (loc)
+ b.append("$diskLoc", loc->toBSONObj());
+ b.done();
+ }
+ else if (loc) {
+ BSONObjBuilder b( bb );
+ b.appendElements(js);
+ b.append("$diskLoc", loc->toBSONObj());
+ b.done();
+ }
+ else {
+ bb.appendBuf((void*) js.objdata(), js.objsize());
+ }
+ }
+
+ typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap;
+ class ScanAndOrder {
+ public:
+ static const unsigned MaxScanAndOrderBytes;
+
+ ScanAndOrder(int startFrom, int limit, BSONObj order, const FieldRangeSet &frs) :
+ _best( BSONObjCmp( order ) ),
+ _startFrom(startFrom), _order(order, frs) {
+ _limit = limit > 0 ? limit + _startFrom : 0x7fffffff;
+ _approxSize = 0;
+ }
+
+ int size() const { return _best.size(); }
+
+ void add(BSONObj o, DiskLoc* loc);
+
+ /* scanning complete. stick the query result in b for n objects. */
+ void fill(BufBuilder& b, Projection *filter, int& nout ) const;
+
+ private:
+
+ void _add(BSONObj& k, BSONObj o, DiskLoc* loc);
+
+ void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc);
+
+ BestMap _best; // key -> full object
+ int _startFrom;
+ int _limit; // max to send back.
+ KeyType _order;
+ unsigned _approxSize;
+
+ };
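+
+ /* usage sketch (illustrative; 'frs', 'cursor', 'filter' and BufBuilder 'b' are
+ assumed to be in scope):
+ ScanAndOrder so( 0, 100, BSON( "a" << 1 ), frs );
+ while ( cursor->ok() ) { so.add( cursor->current(), 0 ); cursor->advance(); }
+ int nout = 0; so.fill( b, filter, nout );
+ */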
+
+} // namespace mongo
diff --git a/src/mongo/db/security.cpp b/src/mongo/db/security.cpp
new file mode 100644
index 00000000000..c9b9bb40326
--- /dev/null
+++ b/src/mongo/db/security.cpp
@@ -0,0 +1,106 @@
+// security.cpp
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "security.h"
+#include "security_common.h"
+#include "instance.h"
+#include "client.h"
+#include "curop-inl.h"
+#include "db.h"
+#include "dbhelpers.h"
+
+// this is the _mongod only_ implementation of security.h
+
+namespace mongo {
+
+ bool AuthenticationInfo::_warned = false;
+ /*
+ void AuthenticationInfo::print() const {
+ cout << "AuthenticationInfo: " << this << '\n';
+ for ( MA::const_iterator i=_dbs.begin(); i!=_dbs.end(); i++ ) {
+ cout << "\t" << i->first << "\t" << i->second.level << '\n';
+ }
+ cout << "END" << endl;
+ }
+ */
+
+ string AuthenticationInfo::getUser( const string& dbname ) const {
+ scoped_spinlock lk(_lock);
+
+ MA::const_iterator i = _dbs.find(dbname);
+ if ( i == _dbs.end() )
+ return "";
+
+ return i->second.user;
+ }
+
+
+ bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) const {
+ if ( cc().isGod() )
+ return true;
+
+ if ( isLocalHost ) {
+ Client::GodScope gs;
+ Client::ReadContext ctx("admin.system.users");
+ BSONObj result;
+ if( ! Helpers::getSingleton("admin.system.users", result) ) {
+ if( ! _warned ) {
+ // you could get a few of these in a race, but that's ok
+ _warned = true;
+ log() << "note: no users configured in admin.system.users, allowing localhost access" << endl;
+ }
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ bool CmdAuthenticate::getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd) {
+ if (user == internalSecurity.user) {
+ uassert(15889, "key file must be used to log in with internal user", cmdLine.keyFile);
+ pwd = internalSecurity.pwd;
+ }
+ else {
+ // static BSONObj userPattern = fromjson("{\"user\":1}");
+ string systemUsers = dbname + ".system.users";
+ // OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");
+ {
+ BSONObjBuilder b;
+ b << "user" << user;
+ BSONObj query = b.done();
+ if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) {
+ log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
+ return false;
+ }
+ }
+
+ pwd = userObj.getStringField("pwd");
+ }
+ return true;
+ }
+
+ bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ AuthenticationInfo *ai = cc().getAuthenticationInfo();
+ ai->logout(dbname);
+ return true;
+ }
+
+} // namespace mongo
+
diff --git a/src/mongo/db/security.h b/src/mongo/db/security.h
new file mode 100755
index 00000000000..f193f305def
--- /dev/null
+++ b/src/mongo/db/security.h
@@ -0,0 +1,113 @@
+// security.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "nonce.h"
+#include "concurrency.h"
+#include "security_common.h"
+#include "../util/concurrency/spin_lock.h"
+
+// this is used by both mongos and mongod
+
+namespace mongo {
+
+ /*
+ * for a particular db
+ * levels
+ * 0 : none
+ * 1 : read
+ * 2 : write
+ */
+ struct Auth {
+
+ enum Level { NONE = 0 , READ = 1 , WRITE = 2 };
+
+ Auth() { level = NONE; }
+ Level level;
+ string user;
+ };
+
+ class AuthenticationInfo : boost::noncopyable {
+ public:
+ bool isLocalHost;
+
+ AuthenticationInfo(){ isLocalHost = false; }
+ ~AuthenticationInfo() {}
+
+ // -- modifiers ----
+
+ void logout(const string& dbname ) {
+ scoped_spinlock lk(_lock);
+ _dbs.erase(dbname);
+ }
+ void authorize(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = Auth::WRITE;
+ _dbs[dbname].user = user;
+ }
+ void authorizeReadOnly(const string& dbname , const string& user ) {
+ scoped_spinlock lk(_lock);
+ _dbs[dbname].level = Auth::READ;
+ _dbs[dbname].user = user;
+ }
+
+ // -- accessors ---
+
+ bool isAuthorized(const string& dbname) const {
+ return _isAuthorized( dbname, Auth::WRITE );
+ }
+
+ bool isAuthorizedReads(const string& dbname) const {
+ return _isAuthorized( dbname, Auth::READ );
+ }
+
+ /**
+ * @param lockType - this is from dbmutex; 1 is write, 0 is read
+ */
+ bool isAuthorizedForLock(const string& dbname, int lockType ) const {
+ return _isAuthorized( dbname , lockType > 0 ? Auth::WRITE : Auth::READ );
+ }
+
+ bool isAuthorizedForLevel( const string& dbname , Auth::Level level ) const {
+ return _isAuthorized( dbname , level );
+ }
+
+ string getUser( const string& dbname ) const;
+
+ void print() const;
+
+ protected:
+ /** takes a lock */
+ bool _isAuthorized(const string& dbname, Auth::Level level) const;
+
+ bool _isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const;
+
+ /** cannot call this locked */
+ bool _isAuthorizedSpecialChecks( const string& dbname ) const ;
+
+ private:
+ mutable SpinLock _lock;
+
+ typedef map<string,Auth> MA;
+ MA _dbs; // dbname -> auth
+
+ static bool _warned;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/security_commands.cpp b/src/mongo/db/security_commands.cpp
new file mode 100644
index 00000000000..33dbd597c83
--- /dev/null
+++ b/src/mongo/db/security_commands.cpp
@@ -0,0 +1,150 @@
+// security_commands.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// security.cpp links with both dbgrid and db. this file is db only -- at least for now.
+
+
+#include "pch.h"
+#include "security.h"
+#include "../util/md5.hpp"
+#include "json.h"
+#include "pdfile.h"
+#include "db.h"
+#include "dbhelpers.h"
+#include "commands.h"
+#include "jsobj.h"
+#include "client.h"
+
+namespace mongo {
+
+ /* authentication
+
+ system.users contains
+ { user : <username>, pwd : <pwd_digest>, ... }
+
+ getnonce sends nonce to client
+
+ client then sends { authenticate:1, nonce:<nonce_str>, user:<username>, key:<key> }
+
+ where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string
+ */
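+
+ /* illustrative sketch of the computation described above (pseudo-code;
+ md5Hex stands in for md5 followed by hex encoding):
+
+ pwd_digest = md5Hex( <username> + ":mongo:" + <password> ) // as stored in system.users
+ key = md5Hex( <nonce_str> + <username> + <pwd_digest> ) // computed by the client
+
+ the server recomputes <key> from its stored pwd_digest and compares.
+ */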
+
+ boost::thread_specific_ptr<nonce64> lastNonce;
+
+ class CmdGetNonce : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() { return false; }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "internal"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdGetNonce() : Command("getnonce") {}
+ bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ nonce64 *n = new nonce64(Security::getNonce());
+ stringstream ss;
+ ss << hex << *n;
+ result.append("nonce", ss.str() );
+ lastNonce.reset(n);
+ return true;
+ }
+ } cmdGetNonce;
+
+ CmdLogout cmdLogout;
+
+ bool CmdAuthenticate::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ log() << " authenticate: " << cmdObj << endl;
+
+ string user = cmdObj.getStringField("user");
+ string key = cmdObj.getStringField("key");
+ string received_nonce = cmdObj.getStringField("nonce");
+
+ if( user.empty() || key.empty() || received_nonce.empty() ) {
+ log() << "field missing/wrong type in received authenticate command "
+ << dbname
+ << endl;
+ errmsg = "auth fails";
+ sleepmillis(10);
+ return false;
+ }
+
+ stringstream digestBuilder;
+
+ {
+ bool reject = false;
+ nonce64 *ln = lastNonce.release();
+ if ( ln == 0 ) {
+ reject = true;
+ log(1) << "auth: no lastNonce" << endl;
+ }
+ else {
+ digestBuilder << hex << *ln;
+ reject = digestBuilder.str() != received_nonce;
+ if ( reject ) log(1) << "auth: different lastNonce" << endl;
+ }
+
+ if ( reject ) {
+ log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << dbname << endl;
+ errmsg = "auth fails";
+ sleepmillis(30);
+ return false;
+ }
+ }
+
+ BSONObj userObj;
+ string pwd;
+ if (!getUserObj(dbname, user, userObj, pwd)) {
+ errmsg = "auth fails";
+ return false;
+ }
+
+ md5digest d;
+ {
+ digestBuilder << user << pwd;
+ string done = digestBuilder.str();
+
+ md5_state_t st;
+ md5_init(&st);
+ md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
+ md5_finish(&st, d);
+ }
+
+ string computed = digestToString( d );
+
+ if ( key != computed ) {
+ log() << "auth: key mismatch " << user << ", ns:" << dbname << endl;
+ errmsg = "auth fails";
+ return false;
+ }
+
+ bool readOnly = userObj["readOnly"].trueValue();
+ authenticate(dbname, user, readOnly );
+
+
+ result.append( "dbname" , dbname );
+ result.append( "user" , user );
+ result.appendBool( "readOnly" , readOnly );
+
+
+ return true;
+ }
+
+ CmdAuthenticate cmdAuthenticate;
+
+} // namespace mongo
diff --git a/src/mongo/db/security_common.cpp b/src/mongo/db/security_common.cpp
new file mode 100644
index 00000000000..a480919c27e
--- /dev/null
+++ b/src/mongo/db/security_common.cpp
@@ -0,0 +1,148 @@
+// security_common.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * This file contains inter-mongo instance security helpers. Due to the
+ * requirement that it be possible to compile this into mongos and mongod, it
+ * should not depend on much external stuff.
+ */
+
+#include "pch.h"
+#include "security.h"
+#include "security_common.h"
+#include "../client/dbclient.h"
+#include "commands.h"
+#include "nonce.h"
+#include "../util/md5.hpp"
+#include "client_common.h"
+#include <sys/stat.h>
+
+namespace mongo {
+
+ bool noauth = true;
+ AuthInfo internalSecurity;
+
+ bool setUpSecurityKey(const string& filename) {
+ struct stat stats;
+
+ // check obvious file errors
+ if (stat(filename.c_str(), &stats) == -1) {
+ log() << "error getting file " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+#if !defined(_WIN32)
+ // check permissions: must be X00, where X is >= 4
+ if ((stats.st_mode & (S_IRWXG|S_IRWXO)) != 0) {
+ log() << "permissions on " << filename << " are too open" << endl;
+ return false;
+ }
+#endif
+
+ const unsigned long long fileLength = stats.st_size;
+ if (fileLength < 6 || fileLength > 1024) {
+ log() << " key file " << filename << " has length " << stats.st_size
+ << ", must be between 6 and 1024 chars" << endl;
+ return false;
+ }
+
+ FILE* file = fopen( filename.c_str(), "rb" );
+ if (!file) {
+ log() << "error opening file: " << filename << ": " << strerror(errno) << endl;
+ return false;
+ }
+
+ string str = "";
+
+ // strip key file
+ unsigned long long read = 0;
+ while (read < fileLength) {
+ char buf;
+ int readLength = fread(&buf, 1, 1, file);
+ if (readLength < 1) {
+ log() << "error reading file " << filename << endl;
+ return false;
+ }
+ read++;
+
+ // check for whitespace
+ if ((buf >= '\x09' && buf <= '\x0D') || buf == ' ') {
+ continue;
+ }
+
+ // check valid base64
+ if ((buf < 'A' || buf > 'Z') && (buf < 'a' || buf > 'z') && (buf < '0' || buf > '9') && buf != '+' && buf != '/') {
+ log() << "invalid char in key file " << filename << ": " << buf << endl;
+ return false;
+ }
+
+ str += buf;
+ }
+
+ if (str.size() < 6) {
+ log() << "security key must be at least 6 characters" << endl;
+ return false;
+ }
+
+ log(1) << "security key: " << str << endl;
+
+ // createPWDigest should really not be a member func
+ DBClientConnection conn;
+ internalSecurity.pwd = conn.createPasswordDigest(internalSecurity.user, str);
+
+ return true;
+ }
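+
+ // illustrative: a conforming key file is 6-1024 base64 characters with no
+ // group/other permission bits set, e.g.
+ // openssl rand -base64 90 > keyfile && chmod 600 keyfile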
+
+ void CmdAuthenticate::authenticate(const string& dbname, const string& user, const bool readOnly) {
+ ClientBasic* c = ClientBasic::getCurrent();
+ assert(c);
+ AuthenticationInfo *ai = c->getAuthenticationInfo();
+
+ if ( readOnly ) {
+ ai->authorizeReadOnly( dbname , user );
+ }
+ else {
+ ai->authorize( dbname , user );
+ }
+ }
+
+
+ bool AuthenticationInfo::_isAuthorized(const string& dbname, Auth::Level level) const {
+ {
+ scoped_spinlock lk(_lock);
+
+ if ( _isAuthorizedSingle_inlock( dbname , level ) )
+ return true;
+
+ if ( noauth )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "admin" , level ) )
+ return true;
+
+ if ( _isAuthorizedSingle_inlock( "local" , level ) )
+ return true;
+ }
+ return _isAuthorizedSpecialChecks( dbname );
+ }
+
+ bool AuthenticationInfo::_isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const {
+ MA::const_iterator i = _dbs.find(dbname);
+ return i != _dbs.end() && i->second.level >= level;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/security_common.h b/src/mongo/db/security_common.h
new file mode 100644
index 00000000000..6615c6e573e
--- /dev/null
+++ b/src/mongo/db/security_common.h
@@ -0,0 +1,85 @@
+// security_common.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "commands.h"
+#include "concurrency.h"
+#include "../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * Internal secret key info.
+ */
+ struct AuthInfo {
+ AuthInfo() {
+ user = "__system";
+ }
+ string user;
+ string pwd;
+ };
+
+ // --noauth cmd line option
+ extern bool noauth;
+ extern AuthInfo internalSecurity;
+
+ /**
+ * This method checks the validity of filename as a security key, hashes its
+ * contents, and stores it in the internalSecurity variable. Prints an
+ * error message to the logs if there's an error.
+ * @param filename the file containing the key
+ * @return if the key was successfully stored
+ */
+ bool setUpSecurityKey(const string& filename);
+
+ class CmdAuthenticate : public Command {
+ public:
+ virtual bool requiresAuth() { return false; }
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ virtual LockType locktype() const { return READ; }
+ virtual void help(stringstream& ss) const { ss << "internal"; }
+ CmdAuthenticate() : Command("authenticate") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ void authenticate(const string& dbname, const string& user, const bool readOnly);
+ private:
+ bool getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd);
+ };
+
+ extern CmdAuthenticate cmdAuthenticate;
+
+ class CmdLogout : public Command {
+ public:
+ virtual bool logTheOp() {
+ return false;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+ void help(stringstream& h) const { h << "de-authenticate"; }
+ virtual LockType locktype() const { return NONE; }
+ CmdLogout() : Command("logout") {}
+ bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl);
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/stats/counters.cpp b/src/mongo/db/stats/counters.cpp
new file mode 100644
index 00000000000..889e8a86c4c
--- /dev/null
+++ b/src/mongo/db/stats/counters.cpp
@@ -0,0 +1,207 @@
+// counters.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "../jsobj.h"
+#include "counters.h"
+
+namespace mongo {
+
+ OpCounters::OpCounters() {
+ int zero = 0;
+
+ BSONObjBuilder b;
+ b.append( "insert" , zero );
+ b.append( "query" , zero );
+ b.append( "update" , zero );
+ b.append( "delete" , zero );
+ b.append( "getmore" , zero );
+ b.append( "command" , zero );
+ _obj = b.obj();
+
+ _insert = (AtomicUInt*)_obj["insert"].value();
+ _query = (AtomicUInt*)_obj["query"].value();
+ _update = (AtomicUInt*)_obj["update"].value();
+ _delete = (AtomicUInt*)_obj["delete"].value();
+ _getmore = (AtomicUInt*)_obj["getmore"].value();
+ _command = (AtomicUInt*)_obj["command"].value();
+ }
+
+ void OpCounters::gotOp( int op , bool isCommand ) {
+ switch ( op ) {
+ case dbInsert: /*gotInsert();*/ break; // need to handle multi-insert
+ case dbQuery:
+ if ( isCommand )
+ gotCommand();
+ else
+ gotQuery();
+ break;
+
+ case dbUpdate: gotUpdate(); break;
+ case dbDelete: gotDelete(); break;
+ case dbGetMore: gotGetMore(); break;
+ case dbKillCursors:
+ case opReply:
+ case dbMsg:
+ break;
+ default: log() << "OpCounters::gotOp unknown op: " << op << endl;
+ }
+ }
+
+ BSONObj& OpCounters::getObj() {
+ const unsigned MAX = 1 << 30;
+ RARELY {
+ bool wrap =
+ _insert->get() > MAX ||
+ _query->get() > MAX ||
+ _update->get() > MAX ||
+ _delete->get() > MAX ||
+ _getmore->get() > MAX ||
+ _command->get() > MAX;
+
+ if ( wrap ) {
+ _insert->zero();
+ _query->zero();
+ _update->zero();
+ _delete->zero();
+ _getmore->zero();
+ _command->zero();
+ }
+
+ }
+ return _obj;
+ }
+
+ IndexCounters::IndexCounters() {
+ _memSupported = _pi.blockCheckSupported();
+
+ _btreeMemHits = 0;
+ _btreeMemMisses = 0;
+ _btreeAccesses = 0;
+
+
+ _maxAllowed = ( numeric_limits< long long >::max() ) / 2;
+ _resets = 0;
+
+ _sampling = 0;
+ _samplingrate = 100;
+ }
+
+ void IndexCounters::append( BSONObjBuilder& b ) {
+ if ( ! _memSupported ) {
+ b.append( "note" , "not supported on this platform" );
+ return;
+ }
+
+ BSONObjBuilder bb( b.subobjStart( "btree" ) );
+ bb.appendNumber( "accesses" , _btreeAccesses );
+ bb.appendNumber( "hits" , _btreeMemHits );
+ bb.appendNumber( "misses" , _btreeMemMisses );
+
+ bb.append( "resets" , _resets );
+
+ bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) );
+
+ bb.done();
+
+ if ( _btreeAccesses > _maxAllowed ) {
+ _btreeAccesses = 0;
+ _btreeMemMisses = 0;
+ _btreeMemHits = 0;
+ _resets++;
+ }
+ }
+
+ FlushCounters::FlushCounters()
+ : _total_time(0)
+ , _flushes(0)
+ , _last()
+ {}
+
+ void FlushCounters::flushed(int ms) {
+ _flushes++;
+ _total_time += ms;
+ _last_time = ms;
+ _last = jsTime();
+ }
+
+ void FlushCounters::append( BSONObjBuilder& b ) {
+ b.appendNumber( "flushes" , _flushes );
+ b.appendNumber( "total_ms" , _total_time );
+ b.appendNumber( "average_ms" , (_flushes ? (_total_time / double(_flushes)) : 0.0) );
+ b.appendNumber( "last_ms" , _last_time );
+ b.append("last_finished", _last);
+ }
+
+
+ void GenericCounter::hit( const string& name , int count ) {
+ scoped_lock lk( _mutex );
+ _counts[name]++;
+ }
+
+ BSONObj GenericCounter::getObj() {
+ BSONObjBuilder b(128);
+ {
+ mongo::mutex::scoped_lock lk( _mutex );
+ for ( map<string,long long>::iterator i=_counts.begin(); i!=_counts.end(); i++ ) {
+ b.appendNumber( i->first , i->second );
+ }
+ }
+ return b.obj();
+ }
+
+
+ void NetworkCounter::hit( long long bytesIn , long long bytesOut ) {
+ const long long MAX = 1ULL << 60;
+
+ // don't care about the race as it's just a counter
+ bool overflow = _bytesIn > MAX || _bytesOut > MAX;
+
+ if ( overflow ) {
+ _lock.lock();
+ _overflows++;
+ _bytesIn = bytesIn;
+ _bytesOut = bytesOut;
+ _requests = 1;
+ _lock.unlock();
+ }
+ else {
+ _lock.lock();
+ _bytesIn += bytesIn;
+ _bytesOut += bytesOut;
+ _requests++;
+ _lock.unlock();
+ }
+ }
+
+ void NetworkCounter::append( BSONObjBuilder& b ) {
+ _lock.lock();
+ b.appendNumber( "bytesIn" , _bytesIn );
+ b.appendNumber( "bytesOut" , _bytesOut );
+ b.appendNumber( "numRequests" , _requests );
+ _lock.unlock();
+ }
+
+
+ OpCounters globalOpCounters;
+ OpCounters replOpCounters;
+ IndexCounters globalIndexCounters;
+ FlushCounters globalFlushCounters;
+ NetworkCounter networkCounter;
+
+}
diff --git a/src/mongo/db/stats/counters.h b/src/mongo/db/stats/counters.h
new file mode 100644
index 00000000000..0cb29aa49aa
--- /dev/null
+++ b/src/mongo/db/stats/counters.h
@@ -0,0 +1,159 @@
+// counters.h
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "../../util/net/message.h"
+#include "../../util/processinfo.h"
+#include "../../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ /**
+ * for storing operation counters
+ * note: not thread safe. ok with that for speed
+ */
+ class OpCounters {
+ public:
+
+ OpCounters();
+
+ AtomicUInt * getInsert() { return _insert; }
+ AtomicUInt * getQuery() { return _query; }
+ AtomicUInt * getUpdate() { return _update; }
+ AtomicUInt * getDelete() { return _delete; }
+ AtomicUInt * getGetMore() { return _getmore; }
+ AtomicUInt * getCommand() { return _command; }
+
+ void incInsertInWriteLock(int n) { _insert->x += n; }
+ void gotInsert() { _insert[0]++; }
+ void gotQuery() { _query[0]++; }
+ void gotUpdate() { _update[0]++; }
+ void gotDelete() { _delete[0]++; }
+ void gotGetMore() { _getmore[0]++; }
+ void gotCommand() { _command[0]++; }
+
+ void gotOp( int op , bool isCommand );
+
+ BSONObj& getObj();
+
+ private:
+ BSONObj _obj;
+
+ // todo: there will be a lot of cache line contention on these. need to do something
+ // else eventually.
+ AtomicUInt * _insert;
+ AtomicUInt * _query;
+ AtomicUInt * _update;
+ AtomicUInt * _delete;
+ AtomicUInt * _getmore;
+ AtomicUInt * _command;
+ };
+
+ extern OpCounters globalOpCounters;
+ extern OpCounters replOpCounters;
+
+
+ class IndexCounters {
+ public:
+ IndexCounters();
+
+ // used without a mutex intentionally (can race)
+ void btree( char * node ) {
+ if ( ! _memSupported )
+ return;
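+ // probe residency on only 1 in _samplingrate calls (1 in 100 by default)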
+ if ( _sampling++ % _samplingrate )
+ return;
+ btree( _pi.blockInMemory( node ) );
+ }
+
+ void btree( bool memHit ) {
+ if ( memHit )
+ _btreeMemHits++;
+ else
+ _btreeMemMisses++;
+ _btreeAccesses++;
+ }
+ void btreeHit() { _btreeMemHits++; _btreeAccesses++; }
+ void btreeMiss() { _btreeMemMisses++; _btreeAccesses++; }
+
+ void append( BSONObjBuilder& b );
+
+ private:
+ ProcessInfo _pi;
+ bool _memSupported;
+
+ int _sampling;
+ int _samplingrate;
+
+ int _resets;
+ long long _maxAllowed;
+
+ long long _btreeMemMisses;
+ long long _btreeMemHits;
+ long long _btreeAccesses;
+ };
+
+ extern IndexCounters globalIndexCounters;
+
+ class FlushCounters {
+ public:
+ FlushCounters();
+
+ void flushed(int ms);
+
+ void append( BSONObjBuilder& b );
+
+ private:
+ long long _total_time;
+ long long _flushes;
+ int _last_time;
+ Date_t _last;
+ };
+
+ extern FlushCounters globalFlushCounters;
+
+
+ class GenericCounter {
+ public:
+ GenericCounter() : _mutex("GenericCounter") { }
+ void hit( const string& name , int count=0 );
+ BSONObj getObj();
+ private:
+ map<string,long long> _counts; // TODO: replace with thread safe map
+ mongo::mutex _mutex;
+ };
+
+ class NetworkCounter {
+ public:
+ NetworkCounter() : _bytesIn(0), _bytesOut(0), _requests(0), _overflows(0) {}
+ void hit( long long bytesIn , long long bytesOut );
+ void append( BSONObjBuilder& b );
+ private:
+ long long _bytesIn;
+ long long _bytesOut;
+ long long _requests;
+
+ long long _overflows;
+
+ SpinLock _lock;
+ };
+
+ extern NetworkCounter networkCounter;
+}
diff --git a/src/mongo/db/stats/fine_clock.h b/src/mongo/db/stats/fine_clock.h
new file mode 100644
index 00000000000..02600e718c4
--- /dev/null
+++ b/src/mongo/db/stats/fine_clock.h
@@ -0,0 +1,67 @@
+// fine_clock.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef DB_STATS_FINE_CLOCK_HEADER
+#define DB_STATS_FINE_CLOCK_HEADER
+
+#include <time.h> // struct timespec
+
+namespace mongo {
+
+ /**
+ * This is a nano-second precision clock. We're skipping the
+ * hardware TSC in favor of clock_gettime(), which on some systems
+ * does not involve a trip to the OS (VDSO).
+ *
+ * We're exporting a type WallTime that is and should remain
+ * opaque. The business of getting accurate time is still ongoing
+ * and we may change the internal representation of this class.
+ * (http://lwn.net/Articles/388188/)
+ *
+ * Really, you shouldn't be using this class in hot code paths on
+ * platforms where you're not sure the overhead is low.
+ */
+ class FineClock {
+ public:
+
+ typedef timespec WallTime;
+
+ static WallTime now() {
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts;
+ }
+
+ static uint64_t diffInNanos( WallTime end, WallTime start ) {
+ uint64_t diff;
+ if ( end.tv_nsec < start.tv_nsec ) {
+ diff = 1000000000 * ( end.tv_sec - start.tv_sec - 1);
+ diff += 1000000000 + end.tv_nsec - start.tv_nsec;
+ }
+ else {
+ diff = 1000000000 * ( end.tv_sec - start.tv_sec );
+ diff += end.tv_nsec - start.tv_nsec;
+ }
+ return diff;
+ }
+
+ };
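+
+ /* usage sketch (illustrative):
+ FineClock::WallTime start = FineClock::now();
+ // ... timed section ...
+ uint64_t elapsedNanos = FineClock::diffInNanos( FineClock::now(), start );
+ */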
+}
+
+#endif // DB_STATS_FINE_CLOCK_HEADER
+
diff --git a/src/mongo/db/stats/service_stats.cpp b/src/mongo/db/stats/service_stats.cpp
new file mode 100644
index 00000000000..d69147fe969
--- /dev/null
+++ b/src/mongo/db/stats/service_stats.cpp
@@ -0,0 +1,68 @@
+// service_stats.cpp
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <sstream>
+
+#include "../../util/histogram.h"
+#include "service_stats.h"
+
+namespace mongo {
+
+ using std::ostringstream;
+
+ ServiceStats::ServiceStats() {
+ // Time histogram covers up to 128msec in exponential intervals
+ // starting at 125usec.
+ Histogram::Options timeOpts;
+ timeOpts.numBuckets = 12;
+ timeOpts.bucketSize = 125;
+ timeOpts.exponential = true;
+ _timeHistogram = new Histogram( timeOpts );
+
+ // Space histogram covers up to 1MB in exponential intervals starting
+ // at 1K.
+ Histogram::Options spaceOpts;
+ spaceOpts.numBuckets = 12;
+ spaceOpts.bucketSize = 1024;
+ spaceOpts.exponential = true;
+ _spaceHistogram = new Histogram( spaceOpts );
+ }
+
+ ServiceStats::~ServiceStats() {
+ delete _timeHistogram;
+ delete _spaceHistogram;
+ }
+
+ void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ) {
+ _spinLock.lock();
+ _timeHistogram->insert( duration / 1000 /* in usecs */ );
+ _spaceHistogram->insert( bytes );
+ _spinLock.unlock();
+ }
+
+ string ServiceStats::toHTML() const {
+ ostringstream res;
+ res << "Cumulative wire stats\n"
+ << "Response times\n" << _timeHistogram->toHTML()
+ << "Response sizes\n" << _spaceHistogram->toHTML()
+ << '\n';
+
+ return res.str();
+ }
+
+} // mongo
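The bucket boundaries in the constructor comments can be checked by hand, assuming each successive bucket doubles the previous one (util/histogram.h itself is not part of this hunk):

    // back-of-the-envelope check of the histogram constructor comments
    #include <cstdio>

    int main() {
        long long timeBound = 125;       // usecs, time histogram
        long long spaceBound = 1024;     // bytes, space histogram
        for ( int k = 0; k < 12; k++ ) {
            printf( "bucket %2d: <= %7lld usec | <= %8lld bytes\n", k, timeBound, spaceBound );
            timeBound <<= 1;             // reaches 128000 usec (128 msec) at k == 10
            spaceBound <<= 1;            // reaches 1048576 bytes (1 MB) at k == 10
        }
        return 0;
    }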
diff --git a/src/mongo/db/stats/service_stats.h b/src/mongo/db/stats/service_stats.h
new file mode 100644
index 00000000000..5b0e75fdcb9
--- /dev/null
+++ b/src/mongo/db/stats/service_stats.h
@@ -0,0 +1,66 @@
+// service_stats.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef DB_STATS_SERVICE_STATS_HEADER
+#define DB_STATS_SERVICE_STATS_HEADER
+
+#include <stdint.h>
+#include <string>
+
+#include "../../util/concurrency/spin_lock.h"
+
+namespace mongo {
+
+ using std::string;
+
+ class Histogram;
+
+ /**
+ * ServiceStats keeps track of the time a request/response message
+ * took inside a service as well as the size of the response
+ * generated.
+ */
+ class ServiceStats {
+ public:
+ ServiceStats();
+ ~ServiceStats();
+
+ /**
+ * Record the 'duration' in nanoseconds a request/response
+ * message took and the size in bytes of the generated
+ * response. (logResponse() divides by 1000 to bucket in usecs.)
+ */
+ void logResponse( uint64_t duration, uint64_t bytes );
+
+ /**
+ * Render the histogram as string that can be used inside an
+ * HTML doc.
+ */
+ string toHTML() const;
+
+ private:
+ SpinLock _spinLock; // protects state below
+ Histogram* _timeHistogram;
+ Histogram* _spaceHistogram;
+
+ // not defined; copying is disallowed
+ ServiceStats( const ServiceStats& );
+ ServiceStats& operator=( const ServiceStats& );
+ };
+
+} // namespace mongo
+
+#endif // DB_STATS_SERVICE_STATS_HEADER
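A hypothetical call site tying this header to fine_clock.h: logResponse() expects the duration in nanoseconds (the .cpp divides by 1000 to bucket in usecs), so FineClock::diffInNanos() is the natural producer. The function name and byte count below are illustrative, not part of the commit:

    #include "service_stats.h"
    #include "fine_clock.h"

    void logOneResponseExample( mongo::ServiceStats& stats ) {
        mongo::FineClock::WallTime start = mongo::FineClock::now();
        // ... build and send the response here ...
        uint64_t bytesSent = 512;   // placeholder size of the generated response
        uint64_t nanos = mongo::FineClock::diffInNanos( mongo::FineClock::now(), start );
        stats.logResponse( nanos, bytesSent );
    }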
diff --git a/src/mongo/db/stats/snapshots.cpp b/src/mongo/db/stats/snapshots.cpp
new file mode 100644
index 00000000000..900cc4ff1ad
--- /dev/null
+++ b/src/mongo/db/stats/snapshots.cpp
@@ -0,0 +1,227 @@
+// snapshots.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "snapshots.h"
+#include "../client.h"
+#include "../clientcursor.h"
+#include "../dbwebserver.h"
+#include "../../util/mongoutils/html.h"
+
+/**
+ handles snapshotting performance metrics and other such things
+ */
+namespace mongo {
+
+ void SnapshotData::takeSnapshot() {
+ _created = curTimeMicros64();
+ _globalUsage = Top::global.getGlobalData();
+// _totalWriteLockedTime = d.dbMutex.info().getTimeLocked();
+ Top::global.cloneMap(_usage);
+ }
+
+ SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer )
+ : _older( older ) , _newer( newer ) {
+ assert( _newer._created > _older._created );
+ _elapsed = _newer._created - _older._created;
+ }
+
+ Top::CollectionData SnapshotDelta::globalUsageDiff() {
+ return Top::CollectionData( _older._globalUsage , _newer._globalUsage );
+ }
+ Top::UsageMap SnapshotDelta::collectionUsageDiff() {
+ assert( _newer._created > _older._created );
+ Top::UsageMap u;
+
+ for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ) {
+ Top::UsageMap::const_iterator j = _older._usage.find(i->first);
+ if (j != _older._usage.end())
+ u[i->first] = Top::CollectionData( j->second , i->second );
+ else
+ u[i->first] = i->second;
+ }
+ return u;
+ }
+
+ Snapshots::Snapshots(int n)
+ : _lock("Snapshots"), _n(n)
+ , _snapshots(new SnapshotData[n])
+ , _loc(0)
+ , _stored(0)
+ {}
+
+ const SnapshotData* Snapshots::takeSnapshot() {
+ scoped_lock lk(_lock);
+ _loc = ( _loc + 1 ) % _n;
+ _snapshots[_loc].takeSnapshot();
+ if ( _stored < _n )
+ _stored++;
+ return &_snapshots[_loc];
+ }
+
+ auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ) {
+ scoped_lock lk(_lock);
+ auto_ptr<SnapshotDelta> p;
+ if ( numBack < numDeltas() )
+ p.reset( new SnapshotDelta( getPrev(numBack+1) , getPrev(numBack) ) );
+ return p;
+ }
+
+ const SnapshotData& Snapshots::getPrev( int numBack ) {
+ int x = _loc - numBack;
+ if ( x < 0 )
+ x += _n;
+ return _snapshots[x];
+ }
+
+ void Snapshots::outputLockInfoHTML( stringstream& ss ) {
+ scoped_lock lk(_lock);
+ ss << "\n<div>";
+ for ( int i=0; i<numDeltas(); i++ ) {
+ SnapshotDelta d( getPrev(i+1) , getPrev(i) );
+ unsigned e = (unsigned) d.elapsed() / 1000;
+ ss << (unsigned)(100*d.percentWriteLocked());
+ if( e < 3900 || e > 4100 )
+ ss << '(' << e / 1000.0 << "s)";
+ ss << ' ';
+ }
+ ss << "</div>\n";
+ }
+
+ void SnapshotThread::run() {
+ Client::initThread("snapshotthread");
+ Client& client = cc();
+
+ long long numLoops = 0;
+
+ const SnapshotData* prev = 0;
+
+ while ( ! inShutdown() ) {
+ try {
+ const SnapshotData* s = statsSnapshots.takeSnapshot();
+
+ if ( prev && cmdLine.cpu ) {
+ unsigned long long elapsed = s->_created - prev->_created;
+ SnapshotDelta d( *prev , *s );
+ log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl;
+ }
+
+ prev = s;
+ }
+ catch ( std::exception& e ) {
+ log() << "ERROR in SnapshotThread: " << e.what() << endl;
+ }
+
+ numLoops++;
+ sleepsecs(4);
+ }
+
+ client.shutdown();
+ }
+
+ using namespace mongoutils::html;
+
+ class WriteLockStatus : public WebStatusPlugin {
+ public:
+ WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ) {}
+ virtual void init() {}
+
+ virtual void run( stringstream& ss ) {
+ statsSnapshots.outputLockInfoHTML( ss );
+
+ ss << "<a "
+ "href=\"http://www.mongodb.org/pages/viewpage.action?pageId=7209296\" "
+ "title=\"snapshot: was the db in the write lock when this page was generated?\">";
+ ss << "write locked now:</a> " << (d.dbMutex.info().isLocked() ? "true" : "false") << "\n";
+ }
+
+ } writeLockStatus;
+
+ class DBTopStatus : public WebStatusPlugin {
+ public:
+ DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurrences|percent of elapsed)" ) {}
+
+ void display( stringstream& ss , double elapsed , const Top::UsageData& usage ) {
+ ss << "<td>";
+ ss << usage.count;
+ ss << "</td><td>";
+ double per = 100 * ((double)usage.time)/elapsed;
+ if( per == (int) per )
+ ss << (int) per;
+ else
+ ss << setprecision(1) << fixed << per;
+ ss << '%';
+ ss << "</td>";
+ }
+
+ void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ) {
+ if ( ns != "TOTAL" && data.total.count == 0 )
+ return;
+ ss << "<tr><th>" << ns << "</th>";
+
+ display( ss , elapsed , data.total );
+
+ display( ss , elapsed , data.readLock );
+ display( ss , elapsed , data.writeLock );
+
+ display( ss , elapsed , data.queries );
+ display( ss , elapsed , data.getmore );
+ display( ss , elapsed , data.insert );
+ display( ss , elapsed , data.update );
+ display( ss , elapsed , data.remove );
+
+ ss << "</tr>\n";
+ }
+
+ void run( stringstream& ss ) {
+ auto_ptr<SnapshotDelta> delta = statsSnapshots.computeDelta();
+ if ( ! delta.get() )
+ return;
+
+ ss << "<table border=1 cellpadding=2 cellspacing=0>";
+ ss << "<tr align='left'><th>";
+ ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") <<
+ "NS</a></th>"
+ "<th colspan=2>total</th>"
+ "<th colspan=2>Reads</th>"
+ "<th colspan=2>Writes</th>"
+ "<th colspan=2>Queries</th>"
+ "<th colspan=2>GetMores</th>"
+ "<th colspan=2>Inserts</th>"
+ "<th colspan=2>Updates</th>"
+ "<th colspan=2>Removes</th>";
+ ss << "</tr>\n";
+
+ display( ss , (double) delta->elapsed() , "TOTAL" , delta->globalUsageDiff() );
+
+ Top::UsageMap usage = delta->collectionUsageDiff();
+ for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ) {
+ display( ss , (double) delta->elapsed() , i->first , i->second );
+ }
+
+ ss << "</table>";
+
+ }
+
+ virtual void init() {}
+ } dbtopStatus;
+
+ Snapshots statsSnapshots;
+ SnapshotThread snapshotThread;
+
+}
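The ring-buffer arithmetic in getPrev() is worth checking by hand; a self-contained replica of the wraparound with the default 100 slots:

    #include <cassert>

    int main() {
        int n = 100;                 // Snapshots(int n=100)
        int loc = 2;                 // slot of the most recent snapshot
        int numBack = 5;             // five snapshots ago
        int x = loc - numBack;       // -3
        if ( x < 0 )
            x += n;                  // wraps to slot 97, same as Snapshots::getPrev
        assert( x == 97 );
        return 0;
    }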
diff --git a/src/mongo/db/stats/snapshots.h b/src/mongo/db/stats/snapshots.h
new file mode 100644
index 00000000000..d9b8e5eb901
--- /dev/null
+++ b/src/mongo/db/stats/snapshots.h
@@ -0,0 +1,114 @@
+// snapshots.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "top.h"
+#include "../../util/background.h"
+
+/**
+ handles snapshotting performance metrics and other such things
+ */
+namespace mongo {
+
+ class SnapshotThread;
+
+ /**
+ * stores a point in time snapshot
+ * i.e. all counters at a given time
+ */
+ class SnapshotData {
+ void takeSnapshot();
+
+ unsigned long long _created;
+ Top::CollectionData _globalUsage;
+ unsigned long long _totalWriteLockedTime; // micros of total time locked
+ Top::UsageMap _usage;
+
+ friend class SnapshotThread;
+ friend class SnapshotDelta;
+ friend class Snapshots;
+ };
+
+ /**
+ * contains performance information for a time period
+ */
+ class SnapshotDelta {
+ public:
+ SnapshotDelta( const SnapshotData& older , const SnapshotData& newer );
+
+ unsigned long long start() const {
+ return _older._created;
+ }
+
+ unsigned long long elapsed() const {
+ return _elapsed;
+ }
+
+ unsigned long long timeInWriteLock() const {
+ return _newer._totalWriteLockedTime - _older._totalWriteLockedTime;
+ }
+ double percentWriteLocked() const {
+ double e = (double) elapsed();
+ double w = (double) timeInWriteLock();
+ return w/e;
+ }
+
+ Top::CollectionData globalUsageDiff();
+ Top::UsageMap collectionUsageDiff();
+
+ private:
+ const SnapshotData& _older;
+ const SnapshotData& _newer;
+
+ unsigned long long _elapsed;
+ };
+
+ class Snapshots {
+ public:
+ Snapshots(int n=100);
+
+ const SnapshotData* takeSnapshot();
+
+ int numDeltas() const { return _stored-1; }
+
+ const SnapshotData& getPrev( int numBack = 0 );
+ auto_ptr<SnapshotDelta> computeDelta( int numBack = 0 );
+
+
+ void outputLockInfoHTML( stringstream& ss );
+ private:
+ mongo::mutex _lock;
+ int _n;
+ boost::scoped_array<SnapshotData> _snapshots;
+ int _loc;
+ int _stored;
+ };
+
+ class SnapshotThread : public BackgroundJob {
+ public:
+ virtual string name() const { return "snapshot"; }
+ void run();
+ };
+
+ extern Snapshots statsSnapshots;
+ extern SnapshotThread snapshotThread;
+
+
+}
diff --git a/src/mongo/db/stats/top.cpp b/src/mongo/db/stats/top.cpp
new file mode 100644
index 00000000000..f5b6ee42f1c
--- /dev/null
+++ b/src/mongo/db/stats/top.cpp
@@ -0,0 +1,183 @@
+// top.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "top.h"
+#include "../../util/net/message.h"
+#include "../commands.h"
+
+namespace mongo {
+
+ Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) {
+ // this won't be 100% accurate on rollovers and drop(), but at least it won't be negative
+ time = (newer.time >= older.time) ? (newer.time - older.time) : newer.time;
+ count = (newer.count >= older.count) ? (newer.count - older.count) : newer.count;
+ }
+
+ Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer )
+ : total( older.total , newer.total ) ,
+ readLock( older.readLock , newer.readLock ) ,
+ writeLock( older.writeLock , newer.writeLock ) ,
+ queries( older.queries , newer.queries ) ,
+ getmore( older.getmore , newer.getmore ) ,
+ insert( older.insert , newer.insert ) ,
+ update( older.update , newer.update ) ,
+ remove( older.remove , newer.remove ),
+ commands( older.commands , newer.commands ) {
+
+ }
+
+ void Top::record( const string& ns , int op , int lockType , long long micros , bool command ) {
+ if ( ns[0] == '?' )
+ return;
+
+ //cout << "record: " << ns << "\t" << op << "\t" << command << endl;
+ scoped_lock lk(_lock);
+
+ if ( ( command || op == dbQuery ) && ns == _lastDropped ) {
+ _lastDropped = "";
+ return;
+ }
+
+ CollectionData& coll = _usage[ns];
+ _record( coll , op , lockType , micros , command );
+ _record( _global , op , lockType , micros , command );
+ }
+
+ void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ) {
+ c.total.inc( micros );
+
+ if ( lockType > 0 )
+ c.writeLock.inc( micros );
+ else if ( lockType < 0 )
+ c.readLock.inc( micros );
+
+ switch ( op ) {
+ case 0:
+ // use 0 for unknown, non-specific
+ break;
+ case dbUpdate:
+ c.update.inc( micros );
+ break;
+ case dbInsert:
+ c.insert.inc( micros );
+ break;
+ case dbQuery:
+ if ( command )
+ c.commands.inc( micros );
+ else
+ c.queries.inc( micros );
+ break;
+ case dbGetMore:
+ c.getmore.inc( micros );
+ break;
+ case dbDelete:
+ c.remove.inc( micros );
+ break;
+ case dbKillCursors:
+ break;
+ case opReply:
+ case dbMsg:
+ log() << "unexpected op in Top::record: " << op << endl;
+ break;
+ default:
+ log() << "unknown op in Top::record: " << op << endl;
+ }
+
+ }
+
+ void Top::collectionDropped( const string& ns ) {
+ //cout << "collectionDropped: " << ns << endl;
+ scoped_lock lk(_lock);
+ _usage.erase(ns);
+ _lastDropped = ns;
+ }
+
+ void Top::cloneMap(Top::UsageMap& out) const {
+ scoped_lock lk(_lock);
+ out = _usage;
+ }
+
+ void Top::append( BSONObjBuilder& b ) {
+ scoped_lock lk( _lock );
+ _appendToUsageMap( b , _usage );
+ }
+
+ void Top::_appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const {
+ for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ) {
+ BSONObjBuilder bb( b.subobjStart( i->first ) );
+
+ const CollectionData& coll = i->second;
+
+ _appendStatsEntry( bb , "total" , coll.total );
+
+ _appendStatsEntry( bb , "readLock" , coll.readLock );
+ _appendStatsEntry( bb , "writeLock" , coll.writeLock );
+
+ _appendStatsEntry( bb , "queries" , coll.queries );
+ _appendStatsEntry( bb , "getmore" , coll.getmore );
+ _appendStatsEntry( bb , "insert" , coll.insert );
+ _appendStatsEntry( bb , "update" , coll.update );
+ _appendStatsEntry( bb , "remove" , coll.remove );
+ _appendStatsEntry( bb , "commands" , coll.commands );
+
+ bb.done();
+ }
+ }
+
+ void Top::_appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& data ) const {
+ BSONObjBuilder bb( b.subobjStart( statsName ) );
+ bb.appendNumber( "time" , data.time );
+ bb.appendNumber( "count" , data.count );
+ bb.done();
+ }
+
+ class TopCmd : public Command {
+ public:
+ TopCmd() : Command( "top", true ) {}
+
+ virtual bool slaveOk() const { return true; }
+ virtual bool adminOnly() const { return true; }
+ virtual LockType locktype() const { return READ; }
+ virtual void help( stringstream& help ) const { help << "usage by collection, in micros "; }
+
+ virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+ {
+ BSONObjBuilder b( result.subobjStart( "totals" ) );
+ b.append( "note" , "all times in microseconds" );
+ Top::global.append( b );
+ b.done();
+ }
+ return true;
+ }
+
+ } topCmd;
+
+ Top Top::global;
+
+ TopOld::T TopOld::_snapshotStart = TopOld::currentTime();
+ TopOld::D TopOld::_snapshotDuration;
+ TopOld::UsageMap TopOld::_totalUsage;
+ TopOld::UsageMap TopOld::_snapshotA;
+ TopOld::UsageMap TopOld::_snapshotB;
+ TopOld::UsageMap &TopOld::_snapshot = TopOld::_snapshotA;
+ TopOld::UsageMap &TopOld::_nextSnapshot = TopOld::_snapshotB;
+ mongo::mutex TopOld::topMutex("topMutex");
+
+
+}
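A hypothetical call into the global Top instance, mirroring what the request path does after timing an operation: a 2500-usec plain query (not a command) under a read lock (negative lockType) increments total, readLock and queries for both "test.foo" and the global bucket:

    #include "top.h"

    void recordQueryExample() {
        // ns, op, lockType, micros, command
        mongo::Top::global.record( "test.foo" , mongo::dbQuery , -1 , 2500 , false );
    }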
diff --git a/src/mongo/db/stats/top.h b/src/mongo/db/stats/top.h
new file mode 100644
index 00000000000..9645ed1a3a6
--- /dev/null
+++ b/src/mongo/db/stats/top.h
@@ -0,0 +1,247 @@
+// top.h : DB usage monitor.
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+ /**
+ * tracks usage by collection
+ */
+ class Top {
+
+ public:
+ Top() : _lock("Top") { }
+
+ struct UsageData {
+ UsageData() : time(0) , count(0) {}
+ UsageData( const UsageData& older , const UsageData& newer );
+ long long time;
+ long long count;
+
+ void inc( long long micros ) {
+ count++;
+ time += micros;
+ }
+ };
+
+ struct CollectionData {
+ CollectionData() {}
+
+ /**
+ * constructs a diff between an older and a newer snapshot
+ */
+ CollectionData( const CollectionData& older , const CollectionData& newer );
+
+ UsageData total;
+
+ UsageData readLock;
+ UsageData writeLock;
+
+ UsageData queries;
+ UsageData getmore;
+ UsageData insert;
+ UsageData update;
+ UsageData remove;
+ UsageData commands;
+ };
+
+ typedef map<string,CollectionData> UsageMap;
+
+ public:
+ void record( const string& ns , int op , int lockType , long long micros , bool command );
+ void append( BSONObjBuilder& b );
+ void cloneMap(UsageMap& out) const;
+ CollectionData getGlobalData() const { return _global; }
+ void collectionDropped( const string& ns );
+
+ public: // static stuff
+ static Top global;
+
+ private:
+ void _appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const;
+ void _appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& data ) const;
+ void _record( CollectionData& c , int op , int lockType , long long micros , bool command );
+
+ mutable mongo::mutex _lock;
+ CollectionData _global;
+ UsageMap _usage;
+ string _lastDropped;
+ };
+
+ /* Records per namespace utilization of the mongod process.
+ No two functions of this class may be called concurrently.
+ */
+ class TopOld {
+ typedef boost::posix_time::ptime T;
+ typedef boost::posix_time::time_duration D;
+ typedef boost::tuple< D, int, int, int > UsageData;
+ public:
+ TopOld() : _read(false), _write(false) { }
+
+ /* these are used to record activity: */
+
+ void clientStart( const char *client ) {
+ clientStop();
+ _currentStart = currentTime();
+ _current = client;
+ }
+
+ /* indicate current request is a read operation. */
+ void setRead() { _read = true; }
+
+ void setWrite() { _write = true; }
+
+ void clientStop() {
+ if ( _currentStart == T() )
+ return;
+ D d = currentTime() - _currentStart;
+
+ {
+ scoped_lock L(topMutex);
+ recordUsage( _current, d );
+ }
+
+ _currentStart = T();
+ _read = false;
+ _write = false;
+ }
+
+ /* these are used to fetch the stats: */
+
+ struct Usage {
+ string ns;
+ D time;
+ double pct;
+ int reads, writes, calls;
+ };
+
+ static void usage( vector< Usage > &res ) {
+ scoped_lock L(topMutex);
+
+ // Populate parent namespaces
+ UsageMap snapshot;
+ UsageMap totalUsage;
+ fillParentNamespaces( snapshot, _snapshot );
+ fillParentNamespaces( totalUsage, _totalUsage );
+
+ multimap< D, string, more > sorted;
+ for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i )
+ sorted.insert( make_pair( i->second.get<0>(), i->first ) );
+ for( multimap< D, string, more >::iterator i = sorted.begin(); i != sorted.end(); ++i ) {
+ if ( trivialNs( i->second.c_str() ) )
+ continue;
+ Usage u;
+ u.ns = i->second;
+ u.time = totalUsage[ u.ns ].get<0>();
+ u.pct = _snapshotDuration != D() ? 100.0 * i->first.ticks() / _snapshotDuration.ticks() : 0;
+ u.reads = snapshot[ u.ns ].get<1>();
+ u.writes = snapshot[ u.ns ].get<2>();
+ u.calls = snapshot[ u.ns ].get<3>();
+ res.push_back( u );
+ }
+ for( UsageMap::iterator i = totalUsage.begin(); i != totalUsage.end(); ++i ) {
+ if ( snapshot.count( i->first ) != 0 || trivialNs( i->first.c_str() ) )
+ continue;
+ Usage u;
+ u.ns = i->first;
+ u.time = i->second.get<0>();
+ u.pct = 0;
+ u.reads = 0;
+ u.writes = 0;
+ u.calls = 0;
+ res.push_back( u );
+ }
+ }
+
+ static void completeSnapshot() {
+ scoped_lock L(topMutex);
+
+ if ( &_snapshot == &_snapshotA ) {
+ _snapshot = _snapshotB;
+ _nextSnapshot = _snapshotA;
+ }
+ else {
+ _snapshot = _snapshotA;
+ _nextSnapshot = _snapshotB;
+ }
+ _snapshotDuration = currentTime() - _snapshotStart;
+ _snapshotStart = currentTime();
+ _nextSnapshot.clear();
+ }
+
+ private:
+ static mongo::mutex topMutex;
+ static bool trivialNs( const char *ns ) {
+ const char *ret = strrchr( ns, '.' );
+ return ret && ret[ 1 ] == '\0';
+ }
+ typedef map<string,UsageData> UsageMap; // duration, # reads, # writes, # total calls
+ static T currentTime() {
+ return boost::posix_time::microsec_clock::universal_time();
+ }
+ void recordUsage( const string &client, D duration ) {
+ recordUsageForMap( _totalUsage, client, duration );
+ recordUsageForMap( _nextSnapshot, client, duration );
+ }
+ void recordUsageForMap( UsageMap &map, const string &client, D duration ) {
+ UsageData& g = map[client];
+ g.get< 0 >() += duration;
+ if ( _read && !_write )
+ g.get< 1 >()++;
+ else if ( !_read && _write )
+ g.get< 2 >()++;
+ g.get< 3 >()++;
+ }
+ static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) {
+ for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) {
+ string current = i->first;
+ size_t dot = current.rfind( "." );
+ if ( dot == string::npos || dot != current.length() - 1 ) {
+ inc( to[ current ], i->second );
+ }
+ while( dot != string::npos ) {
+ current = current.substr( 0, dot );
+ inc( to[ current ], i->second );
+ dot = current.rfind( "." );
+ }
+ }
+ }
+ static void inc( UsageData &to, const UsageData &from ) {
+ to.get<0>() += from.get<0>();
+ to.get<1>() += from.get<1>();
+ to.get<2>() += from.get<2>();
+ to.get<3>() += from.get<3>();
+ }
+ struct more { bool operator()( const D &a, const D &b ) { return a > b; } };
+ string _current;
+ T _currentStart;
+ static T _snapshotStart;
+ static D _snapshotDuration;
+ static UsageMap _totalUsage;
+ static UsageMap _snapshotA;
+ static UsageMap _snapshotB;
+ static UsageMap &_snapshot;
+ static UsageMap &_nextSnapshot;
+ bool _read;
+ bool _write;
+ };
+
+} // namespace mongo
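fillParentNamespaces() is private, but its rollup walk is easy to replicate standalone; with usage recorded against "test.foo.bar", the same loop yields entries for the namespace itself and each parent:

    #include <cstdio>
    #include <map>
    #include <string>
    using std::map; using std::string;

    int main() {
        map<string,int> to;
        string current = "test.foo.bar";
        int calls = 7;                                 // stand-in for the tuple counters
        size_t dot = current.rfind( "." );
        if ( dot == string::npos || dot != current.length() - 1 )
            to[ current ] += calls;                    // the namespace itself
        while ( dot != string::npos ) {                // then "test.foo", then "test"
            current = current.substr( 0, dot );
            to[ current ] += calls;
            dot = current.rfind( "." );
        }
        for ( map<string,int>::iterator i = to.begin(); i != to.end(); ++i )
            printf( "%s -> %d\n", i->first.c_str(), i->second );
        return 0;                                      // prints test, test.foo, test.foo.bar
    }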
diff --git a/src/mongo/db/taskqueue.h b/src/mongo/db/taskqueue.h
new file mode 100644
index 00000000000..005bd986f11
--- /dev/null
+++ b/src/mongo/db/taskqueue.h
@@ -0,0 +1,106 @@
+// @file taskqueue.h
+
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
+
+namespace mongo {
+
+ /** defer work items by queueing them for invocation by another thread. the presumption is
+ that the consumer thread is outside of locks more than the source thread. an additional
+ presumption is that several objects or micro-tasks will be queued and that having a single
+ thread processing them in batch is helpful, as (in the first use case) they share a common
+ data structure that can then stay in local cpu caches.
+
+ this class is in db/ as it is dbMutex (mongomutex) specific (so far).
+
+ using a functor instead of go() might be more elegant; once again, we'd like to test any
+ performance differential. there is also a worry that operator() hides things.
+
+ MT - copyable "micro task" object we can queue
+ must have a static method void MT::go(const MT&)
+
+ see DefInvoke in dbtests/ for an example.
+ */
+ template< class MT >
+ class TaskQueue {
+ public:
+ TaskQueue() : _which(0), _invokeMutex("deferredinvoker") { }
+
+ void defer(MT mt) {
+ // only one writer allowed. however the invoke processing below can occur concurrently with
+ // writes (for the most part)
+ DEV d.dbMutex.assertWriteLocked();
+
+ _queues[_which].push_back(mt);
+ }
+
+ /** call to process deferrals.
+
+ concurrency: handled herein. multiple threads could call invoke(), but their efforts will be
+ serialized. the common case is that there is a single processor calling invoke().
+
+ normally, you call this outside of any lock. but if you want to fully drain the queue,
+ call from within a read lock. for example:
+ {
+ // drain with minimal time in lock
+ d.invoke();
+ readlock lk;
+ d.invoke();
+ ...
+ }
+ you can also call invoke periodically to do some work and then pick up later on more.
+ */
+ void invoke() {
+ mutex::scoped_lock lk2(_invokeMutex);
+ int toDrain = 0;
+ {
+ // flip queueing to the other queue (we are double buffered)
+ readlocktry lk("", 5);
+ if( !lk.got() )
+ return;
+ toDrain = _which;
+ _which = _which ^ 1;
+ wassert( _queues[_which].empty() ); // we are in dbMutex, so it should be/stay empty til we exit dbMutex
+ }
+
+ _drain( _queues[toDrain] );
+ assert( _queues[toDrain].empty() );
+ }
+
+ private:
+ int _which; // 0 or 1
+ typedef vector< MT > Queue;
+ Queue _queues[2];
+
+ // lock order when multiple locks: dbMutex, _invokeMutex
+ mongo::mutex _invokeMutex;
+
+ void _drain(Queue& queue) {
+ unsigned oldCap = queue.capacity();
+ for( typename Queue::iterator i = queue.begin(); i != queue.end(); i++ ) {
+ const MT& v = *i;
+ MT::go(v);
+ }
+ queue.clear();
+ DEV assert( queue.capacity() == oldCap ); // just checking that clear() doesn't deallocate, we don't want that
+ }
+ };
+
+}
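The MT contract is minimal: copyable, with a static void MT::go(const MT&). An illustrative shape (names hypothetical; note that defer() asserts the caller holds dbMutex and invoke() briefly takes a read lock, so this is not runnable standalone):

    struct CacheUpdateTask {                 // hypothetical micro-task
        int key;
        int value;
        static void go( const CacheUpdateTask& t ) {
            // apply t.key / t.value to the shared structure here
        }
    };

    // TaskQueue<CacheUpdateTask> deferredUpdates;
    // writer, inside dbMutex:      deferredUpdates.defer( CacheUpdateTask() );
    // consumer thread, outside it: deferredUpdates.invoke();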
diff --git a/src/mongo/db/tests.cpp b/src/mongo/db/tests.cpp
new file mode 100644
index 00000000000..00f299e1bb6
--- /dev/null
+++ b/src/mongo/db/tests.cpp
@@ -0,0 +1,68 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* tests.cpp
+
+ unit test & such
+*/
+
+#include "pch.h"
+#include "../util/mmap.h"
+
+namespace mongo {
+
+ int test2_old9() {
+ out() << "test2" << endl;
+ printStackTrace();
+ if ( 1 )
+ return 1;
+
+ MemoryMappedFile f;
+
+ unsigned long long len = 64*1024*1024;
+ char *p = (char *) f.map("/tmp/test.dat", len);
+ char *start = p;
+ char *end = p + 64*1024*1024-2;
+ end[1] = 'z';
+ int i = 0;
+ while ( p < end ) {
+ *p++ = ' ';
+ if ( ++i%64 == 0 ) {
+ *p++ = '\n';
+ *p++ = 'x';
+ }
+ }
+ *p = 'a';
+
+ f.flush(true);
+ out() << "done" << endl;
+
+ char *x = start + 32 * 1024 * 1024;
+ char *y = start + 48 * 1024 * 1024;
+ char *z = start + 62 * 1024 * 1024;
+
+ strcpy(z, "zfoo");
+ out() << "y" << endl;
+ strcpy(y, "yfoo");
+ strcpy(x, "xfoo");
+ strcpy(start, "xfoo");
+
+ dbexit( EXIT_TEST );
+
+ return 1;
+ }
+
+} // namespace mongo